wayfarer 0.4.7 → 0.4.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (183)
  1. checksums.yaml +4 -4
  2. data/.env +17 -0
  3. data/.github/workflows/lint.yaml +8 -6
  4. data/.github/workflows/release.yaml +4 -3
  5. data/.github/workflows/tests.yaml +5 -14
  6. data/.gitignore +2 -2
  7. data/.rubocop.yml +31 -0
  8. data/.vale.ini +6 -3
  9. data/Dockerfile +3 -2
  10. data/Gemfile +21 -0
  11. data/Gemfile.lock +233 -128
  12. data/Rakefile +7 -0
  13. data/docker-compose.yml +13 -14
  14. data/docs/guides/callbacks.md +3 -1
  15. data/docs/guides/configuration.md +10 -35
  16. data/docs/guides/development.md +67 -0
  17. data/docs/guides/handlers.md +7 -7
  18. data/docs/guides/jobs.md +54 -11
  19. data/docs/guides/networking/custom_adapters.md +31 -10
  20. data/docs/guides/pages.md +24 -22
  21. data/docs/guides/routing.md +116 -34
  22. data/docs/guides/tasks.md +30 -10
  23. data/docs/guides/tutorial.md +23 -17
  24. data/docs/guides/user_agents.md +11 -9
  25. data/lib/wayfarer/base.rb +9 -8
  26. data/lib/wayfarer/batch_completion.rb +18 -14
  27. data/lib/wayfarer/callbacks.rb +14 -14
  28. data/lib/wayfarer/cli/route_printer.rb +78 -96
  29. data/lib/wayfarer/cli.rb +12 -30
  30. data/lib/wayfarer/gc.rb +6 -1
  31. data/lib/wayfarer/kv.rb +28 -0
  32. data/lib/wayfarer/middleware/chain.rb +7 -1
  33. data/lib/wayfarer/middleware/content_type.rb +20 -15
  34. data/lib/wayfarer/middleware/dedup.rb +9 -3
  35. data/lib/wayfarer/middleware/dispatch.rb +7 -2
  36. data/lib/wayfarer/middleware/normalize.rb +4 -12
  37. data/lib/wayfarer/middleware/router.rb +1 -1
  38. data/lib/wayfarer/middleware/uri_parser.rb +4 -3
  39. data/lib/wayfarer/networking/context.rb +12 -1
  40. data/lib/wayfarer/networking/ferrum.rb +1 -4
  41. data/lib/wayfarer/networking/follow.rb +2 -1
  42. data/lib/wayfarer/networking/pool.rb +12 -7
  43. data/lib/wayfarer/networking/selenium.rb +15 -7
  44. data/lib/wayfarer/page.rb +0 -2
  45. data/lib/wayfarer/parsing/xml.rb +1 -1
  46. data/lib/wayfarer/parsing.rb +2 -5
  47. data/lib/wayfarer/redis/barrier.rb +15 -2
  48. data/lib/wayfarer/redis/counter.rb +1 -2
  49. data/lib/wayfarer/routing/dsl.rb +166 -31
  50. data/lib/wayfarer/routing/hash_stack.rb +33 -0
  51. data/lib/wayfarer/routing/matchers/custom.rb +8 -5
  52. data/lib/wayfarer/routing/matchers/{suffix.rb → empty_params.rb} +2 -6
  53. data/lib/wayfarer/routing/matchers/host.rb +15 -9
  54. data/lib/wayfarer/routing/matchers/path.rb +11 -33
  55. data/lib/wayfarer/routing/matchers/query.rb +41 -17
  56. data/lib/wayfarer/routing/matchers/result.rb +12 -0
  57. data/lib/wayfarer/routing/matchers/scheme.rb +13 -5
  58. data/lib/wayfarer/routing/matchers/url.rb +13 -5
  59. data/lib/wayfarer/routing/path_consumer.rb +130 -0
  60. data/lib/wayfarer/routing/path_finder.rb +151 -23
  61. data/lib/wayfarer/routing/result.rb +1 -1
  62. data/lib/wayfarer/routing/root_route.rb +14 -2
  63. data/lib/wayfarer/routing/route.rb +71 -14
  64. data/lib/wayfarer/routing/serializable.rb +28 -0
  65. data/lib/wayfarer/routing/sub_route.rb +53 -0
  66. data/lib/wayfarer/routing/target_route.rb +17 -1
  67. data/lib/wayfarer/stringify.rb +1 -2
  68. data/lib/wayfarer/task.rb +3 -5
  69. data/lib/wayfarer/uri/normalization.rb +120 -0
  70. data/lib/wayfarer.rb +50 -10
  71. data/mise.toml +2 -0
  72. data/mkdocs.yml +8 -17
  73. data/rake/lint.rake +0 -96
  74. data/rake/release.rake +5 -11
  75. data/rake/tests.rake +8 -4
  76. data/requirements.txt +1 -1
  77. data/spec/factories/job.rb +8 -0
  78. data/spec/factories/middleware.rb +2 -2
  79. data/spec/factories/path_finder.rb +11 -0
  80. data/spec/factories/redis.rb +19 -0
  81. data/spec/factories/task.rb +39 -1
  82. data/spec/spec_helpers.rb +50 -57
  83. data/spec/support/active_job_helpers.rb +8 -0
  84. data/spec/support/integration_helpers.rb +21 -0
  85. data/spec/support/redis_helpers.rb +9 -0
  86. data/spec/support/test_app.rb +64 -43
  87. data/spec/{base_spec.rb → wayfarer/base_spec.rb} +32 -36
  88. data/spec/wayfarer/batch_completion_spec.rb +142 -0
  89. data/spec/wayfarer/cli/job_spec.rb +88 -0
  90. data/spec/wayfarer/cli/routing_spec.rb +322 -0
  91. data/spec/{cli → wayfarer/cli}/version_spec.rb +1 -1
  92. data/spec/wayfarer/gc_spec.rb +29 -0
  93. data/spec/{handler_spec.rb → wayfarer/handler_spec.rb} +1 -3
  94. data/spec/{integration → wayfarer/integration}/callbacks_spec.rb +9 -6
  95. data/spec/wayfarer/integration/content_type_spec.rb +37 -0
  96. data/spec/wayfarer/integration/custom_routing_spec.rb +51 -0
  97. data/spec/{integration → wayfarer/integration}/gc_spec.rb +9 -13
  98. data/spec/{integration → wayfarer/integration}/handler_spec.rb +9 -10
  99. data/spec/{integration → wayfarer/integration}/page_spec.rb +8 -6
  100. data/spec/{integration → wayfarer/integration}/params_spec.rb +4 -4
  101. data/spec/{integration → wayfarer/integration}/parsing_spec.rb +7 -33
  102. data/spec/wayfarer/integration/retry_spec.rb +112 -0
  103. data/spec/{integration → wayfarer/integration}/stage_spec.rb +5 -5
  104. data/spec/{middleware → wayfarer/middleware}/batch_completion_spec.rb +4 -5
  105. data/spec/{middleware → wayfarer/middleware}/chain_spec.rb +20 -15
  106. data/spec/{middleware → wayfarer/middleware}/content_type_spec.rb +18 -21
  107. data/spec/{middleware → wayfarer/middleware}/controller_spec.rb +22 -20
  108. data/spec/wayfarer/middleware/dedup_spec.rb +66 -0
  109. data/spec/wayfarer/middleware/normalize_spec.rb +32 -0
  110. data/spec/{middleware → wayfarer/middleware}/router_spec.rb +18 -20
  111. data/spec/{middleware → wayfarer/middleware}/stage_spec.rb +11 -10
  112. data/spec/wayfarer/middleware/uri_parser_spec.rb +63 -0
  113. data/spec/{middleware → wayfarer/middleware}/user_agent_spec.rb +34 -32
  114. data/spec/wayfarer/networking/capybara_spec.rb +13 -0
  115. data/spec/{networking → wayfarer/networking}/context_spec.rb +46 -38
  116. data/spec/wayfarer/networking/ferrum_spec.rb +13 -0
  117. data/spec/{networking → wayfarer/networking}/follow_spec.rb +9 -4
  118. data/spec/wayfarer/networking/http_spec.rb +12 -0
  119. data/spec/{networking → wayfarer/networking}/pool_spec.rb +11 -9
  120. data/spec/wayfarer/networking/selenium_spec.rb +12 -0
  121. data/spec/{networking → wayfarer/networking}/strategy.rb +33 -54
  122. data/spec/{page_spec.rb → wayfarer/page_spec.rb} +3 -3
  123. data/spec/{parsing → wayfarer/parsing}/json_spec.rb +1 -1
  124. data/spec/{parsing/xml_spec.rb → wayfarer/parsing/xml_parse_spec.rb} +4 -3
  125. data/spec/{redis → wayfarer/redis}/barrier_spec.rb +5 -4
  126. data/spec/wayfarer/redis/counter_spec.rb +34 -0
  127. data/spec/{redis → wayfarer/redis}/pool_spec.rb +3 -2
  128. data/spec/{routing → wayfarer/routing}/dsl_spec.rb +12 -22
  129. data/spec/wayfarer/routing/hash_stack_spec.rb +63 -0
  130. data/spec/wayfarer/routing/integration_spec.rb +101 -0
  131. data/spec/wayfarer/routing/matchers/custom_spec.rb +39 -0
  132. data/spec/wayfarer/routing/matchers/host_spec.rb +56 -0
  133. data/spec/wayfarer/routing/matchers/matcher.rb +17 -0
  134. data/spec/wayfarer/routing/matchers/path_spec.rb +43 -0
  135. data/spec/wayfarer/routing/matchers/query_spec.rb +123 -0
  136. data/spec/wayfarer/routing/matchers/scheme_spec.rb +45 -0
  137. data/spec/wayfarer/routing/matchers/url_spec.rb +33 -0
  138. data/spec/wayfarer/routing/path_consumer_spec.rb +123 -0
  139. data/spec/wayfarer/routing/path_finder_spec.rb +409 -0
  140. data/spec/wayfarer/routing/root_route_spec.rb +51 -0
  141. data/spec/wayfarer/routing/route_spec.rb +74 -0
  142. data/spec/wayfarer/routing/sub_route_spec.rb +103 -0
  143. data/spec/wayfarer/uri/normalization_spec.rb +98 -0
  144. data/spec/wayfarer_spec.rb +2 -2
  145. data/wayfarer.gemspec +17 -28
  146. metadata +768 -246
  147. data/.rbenv-gemsets +0 -1
  148. data/.ruby-version +0 -1
  149. data/RELEASING.md +0 -17
  150. data/docs/cookbook/user_agent.md +0 -7
  151. data/docs/design.md +0 -36
  152. data/docs/guides/jobs/error_handling.md +0 -40
  153. data/docs/reference/configuration.md +0 -36
  154. data/spec/batch_completion_spec.rb +0 -104
  155. data/spec/cli/job_spec.rb +0 -74
  156. data/spec/cli/routing_spec.rb +0 -101
  157. data/spec/fixtures/dummy_job.rb +0 -9
  158. data/spec/gc_spec.rb +0 -17
  159. data/spec/integration/content_type_spec.rb +0 -145
  160. data/spec/integration/routing_spec.rb +0 -18
  161. data/spec/middleware/dedup_spec.rb +0 -71
  162. data/spec/middleware/dispatch_spec.rb +0 -59
  163. data/spec/middleware/normalize_spec.rb +0 -60
  164. data/spec/middleware/uri_parser_spec.rb +0 -53
  165. data/spec/networking/capybara_spec.rb +0 -12
  166. data/spec/networking/ferrum_spec.rb +0 -12
  167. data/spec/networking/http_spec.rb +0 -12
  168. data/spec/networking/selenium_spec.rb +0 -12
  169. data/spec/redis/counter_spec.rb +0 -44
  170. data/spec/routing/integration_spec.rb +0 -110
  171. data/spec/routing/matchers/custom_spec.rb +0 -31
  172. data/spec/routing/matchers/host_spec.rb +0 -49
  173. data/spec/routing/matchers/path_spec.rb +0 -43
  174. data/spec/routing/matchers/query_spec.rb +0 -137
  175. data/spec/routing/matchers/scheme_spec.rb +0 -25
  176. data/spec/routing/matchers/suffix_spec.rb +0 -41
  177. data/spec/routing/matchers/uri_spec.rb +0 -27
  178. data/spec/routing/path_finder_spec.rb +0 -33
  179. data/spec/routing/root_route_spec.rb +0 -29
  180. data/spec/routing/route_spec.rb +0 -43
  181. data/docs/{reference → guides}/cli.md +0 -0
  182. data/spec/{stringify_spec.rb → wayfarer/stringify_spec.rb} +2 -2
  183. data/spec/{task_spec.rb → wayfarer/task_spec.rb} +0 -0
@@ -1,14 +1,13 @@
1
1
  # Tutorial
2
2
 
3
3
  Wayfarer is a web crawling framework written in Ruby.
4
- It works with plain HTTP or by automating web browsers and is deployed with
5
- Redis and a message queue (which can be Redis-based itself).
6
- In development, it can execute fully in memory, without Redis.
4
+ It works with plain HTTP and by automating web browsers interchangeably
5
+ and is deployed with Redis and a message queue.
6
+ During development it can execute fully in memory, without Redis.
7
7
 
8
- You need a compatible version of Ruby installed.
8
+ ## Getting started
9
9
 
10
- To get started, in an empty directory, generate a new `Gemfile` and install
11
- ActiveJob and Wayfarer:
10
+ In an empty directory, generate a new `Gemfile` and install Wayfarer:
12
11
 
13
12
  ```sh
14
13
  bundle init
@@ -22,14 +21,16 @@ Wayfarer builds on Active Job, the message queue abstraction of Rails.
22
21
  You can use Wayfarer without Rails of course, as we do here.
23
22
 
24
23
  A message queue supports two operations: appending messages to the end and consuming
25
- messages from the front. In the case of Wayfarer, messages are tasks, a string pair
26
- consisting of a URL and a batch. When a task is consumed, it is processed by a job,
27
- a Ruby class.
24
+ messages from the front. This is how Wayfarer processes tasks, a string pair
25
+ of URL and batch. Wayfarer enforces that URLs are not processed more than
26
+ once within their batch (excluding retries).
28
27
 
29
- Let's give ourselves a `dummy_job.rb` that routes arbitrary URLs to its
28
+ When a task is consumed, it is processed by a job, a Ruby class.
29
+
30
+ Let's give ourselves a `dummy_job.rb` that routes all URLs to its
30
31
  `index` instance method, where we print the current `task`:
31
32
 
32
- ```ruby
33
+ ```ruby title="dummy_job.rb"
33
34
  require "activejob"
34
35
  require "wayfarer"
35
36
 
@@ -44,17 +45,22 @@ class DummyJob < ActiveJob::Base
44
45
  end
45
46
  ```
46
47
 
47
- We can perform our job from the command line with Wayfarer's CLI and find
48
- that in between ActiveJob's log output, our task was printed with a generated
49
- UUID for its batch:
48
+ We can perform our job from the command line with the `wayfarer perform`
49
+ subcommand. In between ActiveJob's log output, we see that Wayfarer
50
+ has generated a UUID for the batch since we did not pass it:
50
51
 
51
- ```hl_lines="1 3"
52
+ ```sh
52
53
  bundle exec wayfarer perform -r dummy_job.rb DummyJob https://example.com
54
+ ```
55
+
56
+ ```hl_lines="2"
53
57
  [ActiveJob] [DummyJob] [68853491-...] Performing DummyJob (Job ID: 68853491-...) from Async(default) with arguments: #<Wayfarer::Task url="https://example.com", batch="63d14035-...">
54
58
  #<Wayfarer::Task url="https://example.com", batch="63d14035-...">
55
59
  [ActiveJob] [DummyJob] [68853491-...] Performed DummyJob (Job ID: 68853491-) from Async(default) in 507.65ms
56
60
  ```
57
61
 
58
- Many commands accept a `--batch` flag for setting the batch. If you don't
59
- provide one, a UUID is generated.
62
+ If you don't provide a batch, Wayfarer uses a generated UUID instead.
63
+ We could have also used `DummyJob.crawl
64
+
65
+
60
66
 
@@ -1,16 +1,18 @@
1
1
  # User agents
2
2
 
3
3
  User agents are used by [jobs](../jobs) to retrieve the contents behind a URL into a
4
- [page](../pages). They are kept in a connection pool and all user agents in the pool
5
- share the same type and configuration. You can add custom user agents by implementing
4
+ [page](../pages), for example a remotely controlled Firefox process or a Ruby HTTP client.
5
+
6
+ User agents are kept in a connection pool and all user agents in the pool
7
+ share the same type and configuration. You can add your own custom user agents by implementing
6
8
  the [user agent API](custom_user_agents.md).
7
9
 
8
10
  Wayfarer comes with the following built-in user agents:
9
11
 
10
- * [`#!ruby :http`](http.md) (default)
11
- * [`#!ruby :ferrum`](ferrum.md) to automate Google Chrome
12
- * [`#!ruby :selenium`](selenium.md) to automate a variety of browsers
13
- * [`#!ruby :capybara`](capybara.md) to use Capybara sessions
12
+ * [`:http`](http.md) (default)
13
+ * [`:ferrum`](ferrum.md) to automate Google Chrome
14
+ * [`:selenium`](selenium.md) to automate a variety of browsers
15
+ * [`:capybara`](capybara.md) to use Capybara sessions
14
16
 
15
17
  Configure the user agent with the global configuration option:
16
18
 
@@ -53,7 +55,7 @@ class DummyJob < ActiveJob::Base
53
55
  end
54
56
  ```
55
57
 
56
- !!! info "`#fetch` uses the configured `Wayfarer.config.network.http_headers`."
58
+ !!! info "`#fetch` respects `Wayfarer.config.network.http_headers` for all provided user agents."
57
59
 
58
60
  ## HTTP request headers
59
61
 
@@ -78,7 +80,7 @@ underlying message queue operates with. For example, if you use Sidekiq,
78
80
  you should set the pool size to the number of Sidekiq threads:
79
81
 
80
82
  ```ruby
81
- Wayfarer.config[:network][:pool_size] = Sidekiq.options[:concurrency]
83
+ Wayfarer.config[:network][:pool][:size] = Sidekiq.options[:concurrency]
82
84
  ```
83
85
 
84
86
  !!! attention "The connection pool size is 1 by default"
@@ -109,5 +111,5 @@ You can configure the timeout, although you will likely want to increase the
109
111
  pool size instead:
110
112
 
111
113
  ```ruby
112
- Wayfarer.config[:network][:pool_timeout] = 10 # seconds
114
+ Wayfarer.config[:network][:pool][:timeout] = 10 # seconds
113
115
  ```
data/lib/wayfarer/base.rb CHANGED
@@ -4,15 +4,16 @@ module Wayfarer
4
4
  # @!attribute [r] task
5
5
  # @return [Wayfarer::Task] the current task
6
6
  # @!attribute [r] uri
7
- # @return [Addressable::URI] Parsed task URL
7
+ # @return [Addressable::URI] parsed task URL
8
8
  # @!attribute [r] user_agent
9
- # @return [Object] the user agent that retrieved the page
9
+ # @return [Object] the user agent used to retrieve the page
10
10
  # @!attribute [r] action
11
- # @return [Symbol, Object] action that the task URL was routed to.
11
+ # @return [Symbol, Handler] action that the task URL was routed to
12
12
  # @!attribute [r] params
13
13
  # @return [HashWithIndifferentAccess] path parameters collected from routes
14
14
  module Base
15
15
  extend ActiveSupport::Concern
16
+
16
17
  # @!method stage(urls)
17
18
  # Adds URLs to an internal staging set so that they get enqueued
18
19
  # eventually, once the job executed successfully.
@@ -29,8 +30,8 @@ module Wayfarer
29
30
  # @!method page(live: false)
30
31
  # @param url [live] whether to retrieve a new {Page}.
31
32
  # @return [Wayfarer::Page]
32
- # Returns the most recently retrieved page or a new page
33
- # for the current task URL if the `follow` keyword is passed.
33
+ # The most recently retrieved page or a new page for the current task URL if
34
+ # the `live` keyword is passed.
34
35
 
35
36
  # @!scope class
36
37
 
@@ -111,15 +112,15 @@ module Wayfarer
111
112
  included do
112
113
  include Wayfarer::Middleware::Controller
113
114
 
114
- # Implement ActiveJob's #perform by calling into our own middleware chain
115
+ # Implement ActiveJob's #perform by calling into our own middleware
116
+ # chain included from {Controller}
115
117
  alias_method :perform, :call
116
118
 
117
- # Middleware stack
118
119
  use Wayfarer::Middleware::Redis
119
- use Wayfarer::Middleware::BatchCompletion
120
120
  use Wayfarer::Middleware::UriParser
121
121
  use Wayfarer::Middleware::Normalize
122
122
  use Wayfarer::Middleware::Dedup
123
+ use Wayfarer::Middleware::BatchCompletion
123
124
  use Wayfarer::Middleware::Stage
124
125
  use Wayfarer::Middleware::Router
125
126
  use Wayfarer::Middleware::UserAgent
@@ -14,10 +14,10 @@ module Wayfarer
14
14
  module BatchCompletion
15
15
  module_function
16
16
 
17
+ EVENTS = %w[enqueue.active_job perform.active_job retry_stopped.active_job].freeze
18
+
17
19
  def subscribe!
18
- ActiveSupport::Notifications.subscribe("enqueue.active_job", self)
19
- ActiveSupport::Notifications.subscribe("perform.active_job", self)
20
- ActiveSupport::Notifications.subscribe("retry_stopped.active_job", self)
20
+ EVENTS.each { |event| ActiveSupport::Notifications.subscribe(event, self) }
21
21
  end
22
22
 
23
23
  def call(name, _, _, _, data)
@@ -26,25 +26,29 @@ module Wayfarer
26
26
  task = job.arguments.first
27
27
 
28
28
  # In the case of `enqueue.active_job` middleware hasn't executed yet
29
- task[:redis_pool] ||= Wayfarer::Redis::Pool.instance # TODO: Test
30
-
31
- counter = Redis::Counter.new(task) do
32
- job.run_callbacks(:batch)
33
- ensure
34
- Wayfarer::GC.run(task)
35
- end
29
+ task[:redis_pool] ||= Wayfarer::Redis::Pool.instance
36
30
 
37
- handle(name, job, task, counter)
31
+ handle(name, job, task)
38
32
  end
39
33
 
40
- def handle(name, job, task, counter)
34
+ def handle(name, job, task)
35
+ counter = Wayfarer::Redis::Counter.new(task)
36
+
41
37
  case name
42
38
  when "enqueue.active_job" then counter.increment unless retry?(job)
43
- when "perform.active_job" then counter.decrement if succeeded?(job, task)
44
- when "retry_stopped.active_job" then counter.decrement
39
+ when "perform.active_job" then succeed!(task, counter) if succeeded?(job, task)
40
+ when "retry_stopped.active_job" then fail!(counter)
45
41
  end
46
42
  end
47
43
 
44
+ def succeed!(task, counter)
45
+ Wayfarer::GC.run(task) if counter.decrement == 0
46
+ end
47
+
48
+ def fail!(counter)
49
+ counter.decrement
50
+ end
51
+
48
52
  def retry?(job)
49
53
  job.executions > 0
50
54
  end
@@ -13,32 +13,32 @@ module Wayfarer
13
13
  end
14
14
 
15
15
  class_methods do
16
- def before_fetch(*filters, &block)
17
- set_callback(:fetch, :before, *filters, &block)
16
+ def before_fetch(...)
17
+ set_callback(:fetch, :before, ...)
18
18
  end
19
19
 
20
- def around_fetch(*filters, &block)
21
- set_callback(:fetch, :around, *filters, &block)
20
+ def around_fetch(...)
21
+ set_callback(:fetch, :around, ...)
22
22
  end
23
23
 
24
- def after_fetch(*filters, &block)
25
- set_callback(:fetch, :after, *filters, &block)
24
+ def after_fetch(...)
25
+ set_callback(:fetch, :after, ...)
26
26
  end
27
27
 
28
- def before_action(*filters, &block)
29
- set_callback(:action, :before, *filters, &block)
28
+ def before_action(...)
29
+ set_callback(:action, :before, ...)
30
30
  end
31
31
 
32
- def around_action(*filters, &block)
33
- set_callback(:action, :around, *filters, &block)
32
+ def around_action(...)
33
+ set_callback(:action, :around, ...)
34
34
  end
35
35
 
36
- def after_action(*filters, &block)
37
- set_callback(:action, :after, *filters, &block)
36
+ def after_action(...)
37
+ set_callback(:action, :after, ...)
38
38
  end
39
39
 
40
- def after_batch(*filters, &block)
41
- set_callback(:batch, :after, *filters, &block)
40
+ def after_batch(...)
41
+ set_callback(:batch, :after, ...)
42
42
  end
43
43
  end
44
44
  end
@@ -2,128 +2,110 @@
2
2
 
3
3
  module Wayfarer
4
4
  class CLI
5
- class RoutePrinter < Thor::Shell::Color
6
- attr_reader :url, :path_finder, :output
5
+ # Turns a routing tree into a Hash and prints it.
6
+ # Used by the `route` CLI subcommand.
7
+ #
8
+ # @api private
9
+ class RoutePrinter
10
+ # @return [Hash<Symbol, Proc>]
11
+ class_attribute :serializers,
12
+ default: { yaml: ->(hash) { YAML.dump(hash.deep_stringify_keys) },
13
+ json: ->(hash) { JSON.pretty_generate(hash) },
14
+ ruby: ->(hash) { pp(hash) } },
15
+ instance_accessor: false,
16
+ instance_predicate: false
7
17
 
8
- INDENT = " "
9
- REGULAR_SEGMENT = "│ "
10
- JUNCTION_SEGMENT = "├──"
11
- CORNER_SEGMENT = "└──"
18
+ BATCH = "tmp"
12
19
 
13
- def self.print(route, url)
14
- route.accept(new(url))
20
+ # Prints a routing tree.
21
+ #
22
+ # @param route [Wayfarer::Routing::Route] route to print
23
+ # @param url [String] URL to match
24
+ # @param format [String, Symbol] `:json`, `:yaml` or `:ruby`
25
+ def self.print(route, url, format:)
26
+ new(route, url, serializers.fetch(format.to_sym)).print
15
27
  end
16
28
 
17
- def initialize(url)
18
- @url = url
19
- @path_finder = Wayfarer::Routing::PathFinder.new(url)
20
- super()
21
- end
22
-
23
- def visit(route)
24
- route.accept(path_finder) unless route.parent
25
- puts format_route_output(route)
26
- true
27
- end
28
-
29
- private
29
+ # @param route [Wayfarer::Routing::Route] route to print
30
+ # @param url [String] URL to match
31
+ # @param serializer [Proc<Hash=>String>] output serializer
32
+ def initialize(route, url, serializer)
33
+ @route = route
34
+ @serializer = serializer
30
35
 
31
- def format_route_output(route)
32
- [segments(route), route_description(route)].join[3..]
33
- end
36
+ @nodes = {}
37
+ @root_hash = nil
34
38
 
35
- def segments(route)
36
- [parents(route).map { |parent| parent_segment(parent) }, segment(route)].join
39
+ task = Wayfarer::Task.new(url, BATCH)
40
+ task[:uri] = Addressable::URI.parse(url)
41
+ @path_finder = Wayfarer::Routing::PathFinder.new(
42
+ task,
43
+ stop_when_found: false,
44
+ &method(:call)
45
+ )
37
46
  end
38
47
 
39
- def parent_segment(parent)
40
- trailer?(parent) ? INDENT : REGULAR_SEGMENT
41
- end
48
+ # Processes the routing trees and prints the serialized output.
49
+ def print
50
+ route.accept(path_finder)
42
51
 
43
- def segment(route)
44
- trailer?(route) ? CORNER_SEGMENT : JUNCTION_SEGMENT
45
- end
52
+ hash = routing_result(path_finder).merge(root_hash)
46
53
 
47
- def route_description(route)
48
- attrs = [route_arg(route), routing_result(route), route_action(route), route_params(route)].compact
49
- text = attrs.any? ? "#{matcher_name(route)}(#{attrs.join(', ')})" : matcher_name(route)
50
- set_color(text, *route_colors(route))
54
+ puts serializer.call(hash)
51
55
  end
52
56
 
53
- def matcher_name(route)
54
- case route
55
- when Wayfarer::Routing::TargetRoute
56
- "Target"
57
- when Wayfarer::Routing::RootRoute
58
- Wayfarer::Routing::PathFinder.result(route, url).class.name.demodulize
59
- else
60
- route.matcher.class.name.demodulize
61
- end
62
- end
57
+ # Callback method called by `path_finder` with the result of matching
58
+ # the route.
59
+ #
60
+ # @param route [Wayfarer::Routing::Route] the current route
61
+ # @param result [true, false] routing result
62
+ # @param path_finder [Wayfarer::Routing::PathFinder] the path finder
63
+ def call(route, result, path_finder)
64
+ node = (nodes[route] ||= attributes(route, result, path_finder))
65
+ parent = route.parent
63
66
 
64
- def routing_result(route)
65
- return if route.is_a?(Wayfarer::Routing::RootRoute)
67
+ return @root_hash ||= node unless parent
66
68
 
67
- "match: #{route.matcher.match(url)}"
69
+ nodes.dig(parent, route_type(parent), :children).append(node)
68
70
  end
69
71
 
70
- def route_action(route)
71
- return unless route.is_a?(Wayfarer::Routing::RootRoute)
72
-
73
- result = Wayfarer::Routing::PathFinder.result(route, url)
74
- result.action.inspect if result.is_a?(Wayfarer::Routing::Result::Match)
75
- end
72
+ private
76
73
 
77
- def route_arg(route)
78
- return if route.is_a?(Wayfarer::Routing::RootRoute) || route.is_a?(Wayfarer::Routing::TargetRoute)
79
-
80
- matcher = route.matcher
81
- matcher_opts = case matcher
82
- when Wayfarer::Routing::Matchers::Host then matcher.host
83
- when Wayfarer::Routing::Matchers::Path then matcher.path
84
- when Wayfarer::Routing::Matchers::Query then matcher.fields
85
- when Wayfarer::Routing::Matchers::Custom then route.action.to_s
86
- when Wayfarer::Routing::Matchers::Scheme then matcher.scheme
87
- when Wayfarer::Routing::Matchers::Suffix then matcher.suffix
88
- end
89
- matcher_opts.inspect
90
- end
74
+ attr_reader :route,
75
+ :path_finder,
76
+ :serializer,
77
+ :nodes,
78
+ :root_hash
91
79
 
92
- def route_params(route)
93
- params = if route.is_a?(Wayfarer::Routing::RootRoute)
94
- result = Wayfarer::Routing::PathFinder.result(route, url)
95
- result.params if result.is_a?(Wayfarer::Routing::Result::Match)
96
- else
97
- route.matcher.params(url)
98
- end
80
+ def routing_result(path_finder)
81
+ return { routed: false } unless path_finder.found?
99
82
 
100
- "params: #{params.symbolize_keys}" if params&.any?
83
+ action = path_finder.action
84
+ { routed: true,
85
+ params: path_finder.params,
86
+ action: case action
87
+ when Array
88
+ { handler: action.first.class.name, action: action.second }
89
+ else action
90
+ end }
101
91
  end
102
92
 
103
- def parents(route, current = [])
104
- return current unless route.parent
105
-
106
- parents(route.parent, [route.parent, *current])
93
+ def attributes(route, result, path_finder)
94
+ { route_type(route) => route.to_h.merge!(
95
+ route_result(route, result, path_finder),
96
+ children: []
97
+ ) }
107
98
  end
108
99
 
109
- def trailer?(route)
110
- !route.parent || route.parent.children.last == route
111
- end
100
+ def route_result(route, result, path_finder)
101
+ return {} if route.target?
112
102
 
113
- def route_colors(route)
114
- if path_finder.path.include?(route)
115
- %i[green bold]
116
- elsif route.matcher.match(url)
117
- %i[green]
118
- else
119
- %i[red]
120
- end
103
+ { match: result,
104
+ params: path_finder.params_stack.to_h }
121
105
  end
122
106
 
123
- def set_color(string, *colors)
124
- return string if ENV.key?("NO_COLOR")
125
-
126
- super(string, *colors)
107
+ def route_type(route)
108
+ route.class.name.demodulize.underscore
127
109
  end
128
110
  end
129
111
  end
data/lib/wayfarer/cli.rb CHANGED
@@ -13,36 +13,16 @@ module Wayfarer
13
13
 
14
14
  class_option :require, aliases: :r, type: :string, default: nil
15
15
 
16
- desc "route JOB URL", "Routing result of URL for JOB"
16
+ desc "route JOB URL", "Routing tree for URL for JOB"
17
+ option :format, type: :string, enum: %w[yaml json ruby], default: "yaml"
17
18
  def route(job, url)
18
19
  load_environment
19
20
 
20
21
  url = parsed_url(url)
21
- job = job.classify.constantize
22
+ job = job.camelize.constantize
22
23
  route = job.route
23
- route.invoke(url)
24
24
 
25
- result = Wayfarer::Routing::PathFinder.result(route, url)
26
- result_type = result.class.name.demodulize
27
-
28
- say case result
29
- when Wayfarer::Routing::Result::Match
30
- "#{result_type} => #{result.action.inspect}"
31
- else
32
- result_type
33
- end
34
- end
35
-
36
- desc "tree JOB URL", "Visualize JOB's routing tree for URL"
37
- def tree(job, url)
38
- load_environment
39
-
40
- url = parsed_url(url)
41
- job = job.classify.constantize
42
- route = job.route
43
- route.invoke(url)
44
-
45
- Wayfarer::CLI::RoutePrinter.print(route, url)
25
+ Wayfarer::CLI::RoutePrinter.print(route, url, format: options.fetch("format"))
46
26
  end
47
27
 
48
28
  desc "perform JOB URL", "Perform JOB with URL"
@@ -52,8 +32,8 @@ module Wayfarer
52
32
  load_environment
53
33
  mock_redis
54
34
 
55
- job = job.classify.constantize
56
- task = Wayfarer::Task.new(url, options[:batch])
35
+ job = job.camelize.constantize
36
+ task = Wayfarer::Task.new(url, options.fetch(:batch))
57
37
  job.new(task).perform_now
58
38
  end
59
39
 
@@ -62,7 +42,7 @@ module Wayfarer
62
42
  def enqueue(job, url)
63
43
  load_environment
64
44
 
65
- job = job.classify.constantize
45
+ job = job.camelize.constantize
66
46
  job.crawl(url, batch: options[:batch])
67
47
  end
68
48
 
@@ -71,21 +51,23 @@ module Wayfarer
71
51
  option :batch, type: :string, default: SecureRandom.uuid
72
52
  option :min_threads, type: :numeric, default: 1
73
53
  option :max_threads, type: :numeric, default: 1
54
+ option :retain_pool, type: :boolean, default: false
74
55
  def execute(job, url)
75
56
  load_environment
76
57
  mock_redis
77
58
 
78
- job = job.classify.constantize
59
+ job = job.camelize.constantize
79
60
  job.queue_adapter = ActiveJob::QueueAdapters::AsyncAdapter.new(min_threads: options[:min_threads],
80
61
  max_threads: options[:max_threads])
81
62
  scheduler = job.queue_adapter.instance_variable_get(:@scheduler)
82
63
  executor = scheduler.instance_variable_get(:@async_executor)
83
64
 
84
- job.crawl(url, batch: options[:batch])
65
+ job.crawl(url, batch: options.fetch(:batch))
85
66
 
86
67
  sleep(0.1) while executor.scheduled_task_count > executor.completed_task_count
87
68
 
88
- Wayfarer::Networking::Pool.instance.free
69
+ # Used in test suite to avoid pool recreation
70
+ Wayfarer::Networking::Pool.instance.free unless options.fetch(:retain_pool)
89
71
  end
90
72
 
91
73
  private
data/lib/wayfarer/gc.rb CHANGED
@@ -6,12 +6,17 @@ module Wayfarer
6
6
  RESETTABLES = [Wayfarer::Redis::Barrier, Wayfarer::Redis::Counter].freeze
7
7
 
8
8
  class << self
9
- include Wayfarer::Logging.emit(gc: [:info, "Garbage collecting %<resettable>s"])
9
+ include Wayfarer::Logging.emit(
10
+ after_batch: [:debug, "Running `after_batch` callback"],
11
+ gc: [:debug, "Garbage collecting %<resettable>s"]
12
+ )
10
13
  end
11
14
 
12
15
  module_function
13
16
 
14
17
  def run(task)
18
+ task[:job].run_callbacks(:batch)
19
+
15
20
  RESETTABLES.each do |resettable|
16
21
  log(:gc, task, resettable: resettable)
17
22
  resettable.new(task).reset!
@@ -0,0 +1,28 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Wayfarer
4
+ # Provides a key-value store via `[]` and `[]=`.
5
+ #
6
+ # @api private
7
+ module KV
8
+ # @param key [Object] key to fetch
9
+ # @return [Object, nil] value associated with the key or `nil`
10
+ def [](key)
11
+ kv[key]
12
+ end
13
+
14
+ # @param key [Object] key to set
15
+ # @param value [Object] value to set
16
+ # @return [Object] value that was set
17
+ def []=(key, value)
18
+ kv[key] = value
19
+ end
20
+
21
+ private
22
+
23
+ # @return [Hash<Object, Object>]
24
+ def kv
25
+ @kv ||= {}
26
+ end
27
+ end
28
+ end
@@ -2,9 +2,15 @@
2
2
 
3
3
  module Wayfarer
4
4
  module Middleware
5
- Chain = Struct.new(:middlewares) do
5
+ class Chain
6
6
  extend Forwardable
7
7
 
8
+ attr_reader :middlewares
9
+
10
+ def initialize(middlewares)
11
+ @middlewares = middlewares
12
+ end
13
+
8
14
  def self.empty
9
15
  new([])
10
16
  end