wayfarer 0.4.6 → 0.4.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (259) hide show
  1. checksums.yaml +4 -4
  2. data/.env +17 -0
  3. data/.github/workflows/lint.yaml +27 -0
  4. data/.github/workflows/release.yaml +30 -0
  5. data/.github/workflows/tests.yaml +21 -0
  6. data/.gitignore +5 -1
  7. data/.rubocop.yml +36 -0
  8. data/.vale.ini +8 -0
  9. data/.yardopts +1 -3
  10. data/Dockerfile +6 -4
  11. data/Gemfile +24 -0
  12. data/Gemfile.lock +274 -164
  13. data/Rakefile +7 -51
  14. data/bin/wayfarer +1 -1
  15. data/docker-compose.yml +23 -13
  16. data/docs/cookbook/consent_screen.md +2 -2
  17. data/docs/cookbook/executing_javascript.md +3 -3
  18. data/docs/cookbook/navigation.md +12 -12
  19. data/docs/cookbook/querying_html.md +3 -3
  20. data/docs/cookbook/screenshots.md +2 -2
  21. data/docs/guides/callbacks.md +25 -125
  22. data/docs/guides/cli.md +71 -0
  23. data/docs/guides/configuration.md +10 -35
  24. data/docs/guides/development.md +67 -0
  25. data/docs/guides/handlers.md +60 -0
  26. data/docs/guides/index.md +1 -0
  27. data/docs/guides/jobs.md +142 -31
  28. data/docs/guides/navigation.md +1 -1
  29. data/docs/guides/networking/capybara.md +13 -22
  30. data/docs/guides/networking/custom_adapters.md +103 -41
  31. data/docs/guides/networking/ferrum.md +4 -4
  32. data/docs/guides/networking/http.md +9 -13
  33. data/docs/guides/networking/selenium.md +10 -11
  34. data/docs/guides/pages.md +78 -10
  35. data/docs/guides/redis.md +10 -0
  36. data/docs/guides/routing.md +156 -0
  37. data/docs/guides/tasks.md +53 -9
  38. data/docs/guides/tutorial.md +66 -0
  39. data/docs/guides/user_agents.md +115 -0
  40. data/docs/index.md +17 -40
  41. data/lib/wayfarer/base.rb +125 -46
  42. data/lib/wayfarer/batch_completion.rb +60 -0
  43. data/lib/wayfarer/callbacks.rb +22 -48
  44. data/lib/wayfarer/cli/route_printer.rb +85 -89
  45. data/lib/wayfarer/cli.rb +103 -0
  46. data/lib/wayfarer/gc.rb +18 -6
  47. data/lib/wayfarer/handler.rb +15 -7
  48. data/lib/wayfarer/kv.rb +28 -0
  49. data/lib/wayfarer/logging.rb +38 -0
  50. data/lib/wayfarer/middleware/base.rb +2 -0
  51. data/lib/wayfarer/middleware/batch_completion.rb +19 -0
  52. data/lib/wayfarer/middleware/chain.rb +7 -1
  53. data/lib/wayfarer/middleware/content_type.rb +59 -0
  54. data/lib/wayfarer/middleware/controller.rb +19 -15
  55. data/lib/wayfarer/middleware/dedup.rb +22 -13
  56. data/lib/wayfarer/middleware/dispatch.rb +17 -4
  57. data/lib/wayfarer/middleware/normalize.rb +7 -14
  58. data/lib/wayfarer/middleware/redis.rb +15 -0
  59. data/lib/wayfarer/middleware/router.rb +33 -35
  60. data/lib/wayfarer/middleware/stage.rb +5 -5
  61. data/lib/wayfarer/middleware/uri_parser.rb +31 -0
  62. data/lib/wayfarer/middleware/user_agent.rb +49 -0
  63. data/lib/wayfarer/networking/capybara.rb +1 -1
  64. data/lib/wayfarer/networking/context.rb +14 -3
  65. data/lib/wayfarer/networking/ferrum.rb +1 -4
  66. data/lib/wayfarer/networking/follow.rb +14 -7
  67. data/lib/wayfarer/networking/http.rb +1 -1
  68. data/lib/wayfarer/networking/pool.rb +23 -13
  69. data/lib/wayfarer/networking/selenium.rb +15 -7
  70. data/lib/wayfarer/networking/strategy.rb +2 -2
  71. data/lib/wayfarer/page.rb +34 -14
  72. data/lib/wayfarer/parsing/xml.rb +6 -6
  73. data/lib/wayfarer/parsing.rb +21 -0
  74. data/lib/wayfarer/redis/barrier.rb +26 -21
  75. data/lib/wayfarer/redis/counter.rb +18 -9
  76. data/lib/wayfarer/redis/pool.rb +1 -1
  77. data/lib/wayfarer/redis/resettable.rb +19 -0
  78. data/lib/wayfarer/routing/dsl.rb +166 -30
  79. data/lib/wayfarer/routing/hash_stack.rb +33 -0
  80. data/lib/wayfarer/routing/matchers/custom.rb +8 -5
  81. data/lib/wayfarer/routing/matchers/{suffix.rb → empty_params.rb} +2 -6
  82. data/lib/wayfarer/routing/matchers/host.rb +15 -9
  83. data/lib/wayfarer/routing/matchers/path.rb +11 -31
  84. data/lib/wayfarer/routing/matchers/query.rb +41 -17
  85. data/lib/wayfarer/routing/matchers/result.rb +12 -0
  86. data/lib/wayfarer/routing/matchers/scheme.rb +13 -5
  87. data/lib/wayfarer/routing/matchers/url.rb +13 -5
  88. data/lib/wayfarer/routing/path_consumer.rb +130 -0
  89. data/lib/wayfarer/routing/path_finder.rb +151 -23
  90. data/lib/wayfarer/routing/result.rb +1 -1
  91. data/lib/wayfarer/routing/root_route.rb +17 -1
  92. data/lib/wayfarer/routing/route.rb +66 -19
  93. data/lib/wayfarer/routing/serializable.rb +28 -0
  94. data/lib/wayfarer/routing/sub_route.rb +53 -0
  95. data/lib/wayfarer/routing/target_route.rb +17 -1
  96. data/lib/wayfarer/stringify.rb +21 -30
  97. data/lib/wayfarer/task.rb +9 -17
  98. data/lib/wayfarer/uri/normalization.rb +120 -0
  99. data/lib/wayfarer.rb +72 -5
  100. data/mise.toml +2 -0
  101. data/mkdocs.yml +44 -8
  102. data/rake/docs.rake +26 -0
  103. data/rake/lint.rake +9 -0
  104. data/rake/release.rake +23 -0
  105. data/rake/tests.rake +32 -0
  106. data/requirements.txt +1 -1
  107. data/spec/factories/job.rb +8 -0
  108. data/spec/factories/middleware.rb +2 -2
  109. data/spec/factories/path_finder.rb +11 -0
  110. data/spec/factories/redis.rb +19 -0
  111. data/spec/factories/task.rb +46 -2
  112. data/spec/spec_helpers.rb +55 -51
  113. data/spec/support/active_job_helpers.rb +8 -0
  114. data/spec/support/integration_helpers.rb +21 -0
  115. data/spec/support/redis_helpers.rb +9 -0
  116. data/spec/support/test_app.rb +66 -37
  117. data/spec/wayfarer/base_spec.rb +200 -0
  118. data/spec/wayfarer/batch_completion_spec.rb +142 -0
  119. data/spec/wayfarer/cli/job_spec.rb +88 -0
  120. data/spec/wayfarer/cli/routing_spec.rb +322 -0
  121. data/spec/{cli → wayfarer/cli}/version_spec.rb +1 -1
  122. data/spec/wayfarer/gc_spec.rb +29 -0
  123. data/spec/wayfarer/handler_spec.rb +9 -0
  124. data/spec/wayfarer/integration/callbacks_spec.rb +200 -0
  125. data/spec/wayfarer/integration/content_type_spec.rb +37 -0
  126. data/spec/wayfarer/integration/custom_routing_spec.rb +51 -0
  127. data/spec/wayfarer/integration/gc_spec.rb +40 -0
  128. data/spec/wayfarer/integration/handler_spec.rb +65 -0
  129. data/spec/wayfarer/integration/page_spec.rb +79 -0
  130. data/spec/wayfarer/integration/params_spec.rb +64 -0
  131. data/spec/wayfarer/integration/parsing_spec.rb +99 -0
  132. data/spec/wayfarer/integration/retry_spec.rb +112 -0
  133. data/spec/wayfarer/integration/stage_spec.rb +58 -0
  134. data/spec/wayfarer/middleware/batch_completion_spec.rb +33 -0
  135. data/spec/{middleware → wayfarer/middleware}/chain_spec.rb +24 -19
  136. data/spec/wayfarer/middleware/content_type_spec.rb +83 -0
  137. data/spec/{middleware → wayfarer/middleware}/controller_spec.rb +24 -22
  138. data/spec/wayfarer/middleware/dedup_spec.rb +66 -0
  139. data/spec/wayfarer/middleware/normalize_spec.rb +32 -0
  140. data/spec/wayfarer/middleware/router_spec.rb +102 -0
  141. data/spec/wayfarer/middleware/stage_spec.rb +63 -0
  142. data/spec/wayfarer/middleware/uri_parser_spec.rb +63 -0
  143. data/spec/wayfarer/middleware/user_agent_spec.rb +158 -0
  144. data/spec/wayfarer/networking/capybara_spec.rb +13 -0
  145. data/spec/{networking → wayfarer/networking}/context_spec.rb +46 -38
  146. data/spec/wayfarer/networking/ferrum_spec.rb +13 -0
  147. data/spec/{networking → wayfarer/networking}/follow_spec.rb +11 -6
  148. data/spec/wayfarer/networking/http_spec.rb +12 -0
  149. data/spec/{networking → wayfarer/networking}/pool_spec.rb +16 -14
  150. data/spec/wayfarer/networking/selenium_spec.rb +12 -0
  151. data/spec/{networking → wayfarer/networking}/strategy.rb +33 -54
  152. data/spec/wayfarer/page_spec.rb +69 -0
  153. data/spec/{parsing → wayfarer/parsing}/json_spec.rb +1 -1
  154. data/spec/wayfarer/parsing/xml_parse_spec.rb +25 -0
  155. data/spec/wayfarer/redis/barrier_spec.rb +39 -0
  156. data/spec/wayfarer/redis/counter_spec.rb +34 -0
  157. data/spec/{redis → wayfarer/redis}/pool_spec.rb +4 -3
  158. data/spec/{routing → wayfarer/routing}/dsl_spec.rb +12 -22
  159. data/spec/wayfarer/routing/hash_stack_spec.rb +63 -0
  160. data/spec/wayfarer/routing/integration_spec.rb +101 -0
  161. data/spec/wayfarer/routing/matchers/custom_spec.rb +39 -0
  162. data/spec/wayfarer/routing/matchers/host_spec.rb +56 -0
  163. data/spec/wayfarer/routing/matchers/matcher.rb +17 -0
  164. data/spec/wayfarer/routing/matchers/path_spec.rb +43 -0
  165. data/spec/wayfarer/routing/matchers/query_spec.rb +123 -0
  166. data/spec/wayfarer/routing/matchers/scheme_spec.rb +45 -0
  167. data/spec/wayfarer/routing/matchers/url_spec.rb +33 -0
  168. data/spec/wayfarer/routing/path_consumer_spec.rb +123 -0
  169. data/spec/wayfarer/routing/path_finder_spec.rb +409 -0
  170. data/spec/wayfarer/routing/root_route_spec.rb +51 -0
  171. data/spec/wayfarer/routing/route_spec.rb +74 -0
  172. data/spec/wayfarer/routing/sub_route_spec.rb +103 -0
  173. data/spec/wayfarer/task_spec.rb +13 -0
  174. data/spec/wayfarer/uri/normalization_spec.rb +98 -0
  175. data/spec/wayfarer_spec.rb +2 -2
  176. data/wayfarer.gemspec +18 -28
  177. metadata +797 -265
  178. data/.github/workflows/ci.yaml +0 -32
  179. data/.rbenv-gemsets +0 -1
  180. data/.ruby-version +0 -1
  181. data/RELEASING.md +0 -17
  182. data/docs/cookbook/user_agent.md +0 -7
  183. data/docs/guides/error_handling.md +0 -53
  184. data/docs/guides/networking.md +0 -94
  185. data/docs/guides/performance.md +0 -130
  186. data/docs/guides/reliability.md +0 -41
  187. data/docs/guides/routing/steering.md +0 -30
  188. data/docs/reference/api/base.md +0 -48
  189. data/docs/reference/cli.md +0 -61
  190. data/docs/reference/configuration_keys.md +0 -43
  191. data/docs/reference/environment_variables.md +0 -83
  192. data/lib/wayfarer/cli/base.rb +0 -45
  193. data/lib/wayfarer/cli/generate.rb +0 -17
  194. data/lib/wayfarer/cli/job.rb +0 -56
  195. data/lib/wayfarer/cli/route.rb +0 -29
  196. data/lib/wayfarer/cli/runner.rb +0 -34
  197. data/lib/wayfarer/cli/templates/Gemfile.tt +0 -5
  198. data/lib/wayfarer/cli/templates/job.rb.tt +0 -10
  199. data/lib/wayfarer/config/capybara.rb +0 -10
  200. data/lib/wayfarer/config/ferrum.rb +0 -11
  201. data/lib/wayfarer/config/networking.rb +0 -29
  202. data/lib/wayfarer/config/redis.rb +0 -14
  203. data/lib/wayfarer/config/root.rb +0 -11
  204. data/lib/wayfarer/config/selenium.rb +0 -21
  205. data/lib/wayfarer/config/strconv.rb +0 -45
  206. data/lib/wayfarer/config/struct.rb +0 -72
  207. data/lib/wayfarer/middleware/fetch.rb +0 -56
  208. data/lib/wayfarer/redis/connection.rb +0 -13
  209. data/lib/wayfarer/redis/version.rb +0 -19
  210. data/lib/wayfarer/routing/router.rb +0 -28
  211. data/spec/base_spec.rb +0 -224
  212. data/spec/callbacks_spec.rb +0 -102
  213. data/spec/cli/generate_spec.rb +0 -39
  214. data/spec/cli/job_spec.rb +0 -78
  215. data/spec/config/capybara_spec.rb +0 -18
  216. data/spec/config/ferrum_spec.rb +0 -24
  217. data/spec/config/networking_spec.rb +0 -73
  218. data/spec/config/redis_spec.rb +0 -32
  219. data/spec/config/root_spec.rb +0 -31
  220. data/spec/config/selenium_spec.rb +0 -56
  221. data/spec/config/strconv_spec.rb +0 -58
  222. data/spec/config/struct_spec.rb +0 -66
  223. data/spec/fixtures/dummy_job.rb +0 -7
  224. data/spec/gc_spec.rb +0 -59
  225. data/spec/handler_spec.rb +0 -11
  226. data/spec/integration/callbacks_spec.rb +0 -85
  227. data/spec/integration/page_spec.rb +0 -62
  228. data/spec/integration/params_spec.rb +0 -56
  229. data/spec/integration/stage_spec.rb +0 -51
  230. data/spec/integration/steering_spec.rb +0 -57
  231. data/spec/middleware/dedup_spec.rb +0 -88
  232. data/spec/middleware/dispatch_spec.rb +0 -43
  233. data/spec/middleware/fetch_spec.rb +0 -155
  234. data/spec/middleware/normalize_spec.rb +0 -29
  235. data/spec/middleware/router_spec.rb +0 -105
  236. data/spec/middleware/stage_spec.rb +0 -62
  237. data/spec/networking/capybara_spec.rb +0 -12
  238. data/spec/networking/ferrum_spec.rb +0 -12
  239. data/spec/networking/http_spec.rb +0 -12
  240. data/spec/networking/selenium_spec.rb +0 -12
  241. data/spec/page_spec.rb +0 -47
  242. data/spec/parsing/xml_spec.rb +0 -25
  243. data/spec/redis/barrier_spec.rb +0 -78
  244. data/spec/redis/counter_spec.rb +0 -32
  245. data/spec/redis/version_spec.rb +0 -13
  246. data/spec/routing/integration_spec.rb +0 -110
  247. data/spec/routing/matchers/custom_spec.rb +0 -31
  248. data/spec/routing/matchers/host_spec.rb +0 -49
  249. data/spec/routing/matchers/path_spec.rb +0 -43
  250. data/spec/routing/matchers/query_spec.rb +0 -137
  251. data/spec/routing/matchers/scheme_spec.rb +0 -25
  252. data/spec/routing/matchers/suffix_spec.rb +0 -41
  253. data/spec/routing/matchers/uri_spec.rb +0 -27
  254. data/spec/routing/path_finder_spec.rb +0 -33
  255. data/spec/routing/root_route_spec.rb +0 -29
  256. data/spec/routing/route_spec.rb +0 -43
  257. data/spec/routing/router_spec.rb +0 -24
  258. data/spec/task_spec.rb +0 -34
  259. data/spec/{stringify_spec.rb → wayfarer/stringify_spec.rb} +2 -2
data/lib/wayfarer/base.rb CHANGED
@@ -1,60 +1,139 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Wayfarer
4
- class Base < ActiveJob::Base
5
- include Wayfarer::Middleware::Controller
6
-
7
- use Wayfarer::Middleware::Stage
8
- use Wayfarer::Middleware::Dedup
9
- use Wayfarer::Middleware::Normalize
10
- use Wayfarer::Middleware::Router
11
- use Wayfarer::Middleware::Fetch
12
- use Wayfarer::Middleware::Dispatch
13
-
14
- ErrorHandler = lambda do |&block|
15
- lambda do |job, error|
16
- task = job.arguments.first
17
- task.barrier.seen?(task.url)
18
- task.gc.run
19
- block.call(job, error)
20
- end
21
- end
4
+ # @!attribute [r] task
5
+ # @return [Wayfarer::Task] the current task
6
+ # @!attribute [r] uri
7
+ # @return [Addressable::URI] parsed task URL
8
+ # @!attribute [r] user_agent
9
+ # @return [Object] the user agent used to retrieve the page
10
+ # @!attribute [r] action
11
+ # @return [Symbol, Handler] action that the task URL was routed to
12
+ # @!attribute [r] params
13
+ # @return [HashWithIndifferentAccess] path parameters collected from routes
14
+ module Base
15
+ extend ActiveSupport::Concern
22
16
 
23
- after_enqueue do |job|
24
- task = job.arguments.first
25
- task.counter.increment
26
- end
17
+ # @!method stage(urls)
18
+ # Adds URLs to an internal staging set so that they get enqueued
19
+ # eventually, once the job executed successfully.
20
+ # @overload stage(urls)
21
+ # @param urls [Array<String>] URLs to add to the staging set.
22
+ # @overload stage(url)
23
+ # @param url [String] URL to add to the staging set.
27
24
 
28
- after_perform do |job|
29
- task = job.arguments.first
30
- task.gc.run
31
- end
25
+ # @!method fetch(url, follow: 3)
26
+ # @param url [String] URL to fetch using plain HTTP(S).
27
+ # @param follow [Integer] Number of redirects to follow.
28
+ # Retrieves the given URL to a {Page}.
32
29
 
33
- rescue_from(StandardError) do
34
- task = arguments.first
35
- task.gc.run
36
- end
30
+ # @!method page(live: false)
31
+ # @param live [Boolean] whether to retrieve a new {Page}.
32
+ # @return [Wayfarer::Page]
33
+ # The most recently retrieved page or a new page for the current task URL if
34
+ # the `live` keyword is passed.
37
35
 
38
- def self.retry_on(*argv, &block)
39
- super(*argv, &ErrorHandler.call(&block))
40
- end
36
+ # @!scope class
41
37
 
42
- def self.discard_on(*argv, &block)
43
- super(*argv, &ErrorHandler.call(&block))
44
- end
38
+ # @!attribute [r] route
39
+ # @return [Wayfarer::Routing::DSL]
40
+ # The job's {Wayfarer::Routing::DSL} that maps URLs to instance methods
41
+ # or to a {Handler}.
42
+ # @example Append a host route
43
+ # route.host "example.com", to: :index
45
44
 
46
- def self.crawl(url, batch: SecureRandom.uuid)
47
- Task.new(url, batch).tap do |task|
48
- perform_later(task)
49
- end
50
- end
45
+ # @!method content_types(*content_types)
46
+ # @param content_types [*Array<String, Regexp>] Content-Types to whitelist
47
+ # Whitelists Content-Types. Once at least one Content-Type is set, only
48
+ # those Content-Types will be processed.
49
+
50
+ # @!group Callbacks
51
+
52
+ # @!method before_fetch
53
+ # @overload before_fetch(callback)
54
+ # @param callback [Symbol] Instance method to call
55
+ # @overload before_fetch(&block)
56
+ # @yield [Wayfarer::Task]
57
+ # Registers a callback that is called before the page is fetched.
58
+ # If a symbol is passed, an instance method with the same name will be
59
+ # called.
60
+ # @example Accessing the user agent in {#before_fetch}
61
+ # before_fetch do |task|
62
+ # user_agent # => the user agent that will fetch the page
63
+ # end
51
64
 
52
- def retry_job(...)
53
- super(...) # increments the counter by re-enqueuing the job
54
- task = arguments.first
55
- task.counter.decrement
65
+ # @!method around_fetch
66
+ # @overload around_fetch(callback)
67
+ # @param callback [Symbol] Instance method to call
68
+ # @overload around_fetch(&block)
69
+ # @yield [Wayfarer::Task]
70
+ # Registers a callback that is called around the page getting fetched.
71
+ # If a symbol is passed, an instance method with the same name will be
72
+ # called.
73
+
74
+ # @!method after_fetch
75
+ # @overload after_fetch(callback)
76
+ # @param callback [Symbol] Instance method to call
77
+ # @overload after_fetch(&block)
78
+ # @yield [Wayfarer::Task]
79
+ # Registers a callback that is called after the page was fetched.
80
+ # If a symbol is passed, an instance method with the same name will be
81
+ # called.
82
+
83
+ # @!method before_perform
84
+ # @overload before_perform(callback)
85
+ # @param callback [Symbol] Instance method to call
86
+ # @overload before_perform(&block)
87
+ # @yield [Wayfarer::Task]
88
+ # Registers a callback that is called before the task is performed.
89
+ # If a symbol is passed, an instance method with the same name will be
90
+ # called.
91
+
92
+ # @!method around_perform
93
+ # @overload around_perform(callback)
94
+ # @param callback [Symbol] Instance method to call
95
+ # @overload around_perform(&block)
96
+ # @yield [Wayfarer::Task]
97
+ # Registers a callback that is called around the task getting performed.
98
+ # If a symbol is passed, an instance method with the same name will be
99
+ # called.
100
+
101
+ # @!method after_perform
102
+ # @overload after_perform(callback)
103
+ # @param callback [Symbol] Instance method to call
104
+ # @overload after_perform(&block)
105
+ # @yield [Wayfarer::Task]
106
+ # Registers a callback that is called after the task was performed.
107
+ # If a symbol is passed, an instance method with the same name will be
108
+ # called.
109
+
110
+ # @!endgroup
111
+
112
+ included do
113
+ include Wayfarer::Middleware::Controller
114
+
115
+ # Implement ActiveJob's #perform by calling into our own middleware
116
+ # chain included from {Controller}
117
+ alias_method :perform, :call
118
+
119
+ use Wayfarer::Middleware::Redis
120
+ use Wayfarer::Middleware::UriParser
121
+ use Wayfarer::Middleware::Normalize
122
+ use Wayfarer::Middleware::Dedup
123
+ use Wayfarer::Middleware::BatchCompletion
124
+ use Wayfarer::Middleware::Stage
125
+ use Wayfarer::Middleware::Router
126
+ use Wayfarer::Middleware::UserAgent
127
+ use Wayfarer::Middleware::ContentType
128
+ use Wayfarer::Middleware::Dispatch
56
129
  end
57
130
 
58
- alias perform call
131
+ class_methods do
132
+ def crawl(url, batch: SecureRandom.uuid)
133
+ Task.new(url, batch).tap do |task|
134
+ perform_later(task)
135
+ end
136
+ end
137
+ end
59
138
  end
60
139
  end
@@ -0,0 +1,60 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Wayfarer
4
+ # BatchCompletion tracks the completion of a batch of jobs.
5
+ # It does so by incrementing and decrementing a counter in Redis.
6
+ #
7
+ # The counter is incremented when a job is first enqueued and decremented when
8
+ # a job is performed. If a job is retried, the counter is not incremented.
9
+ # When a job succeeds, or fails after exhausting its retries, the counter
10
+ # is decremented.
11
+ #
12
+ # When the counter reaches zero, garbage collection deletes the Redis keys
13
+ # associated with the batch.
14
+ module BatchCompletion
15
+ module_function
16
+
17
+ EVENTS = %w[enqueue.active_job perform.active_job retry_stopped.active_job].freeze
18
+
19
+ def subscribe!
20
+ EVENTS.each { |event| ActiveSupport::Notifications.subscribe(event, self) }
21
+ end
22
+
23
+ def call(name, _, _, _, data)
24
+ return unless (job = data[:job]).is_a?(Wayfarer::Base)
25
+
26
+ task = job.arguments.first
27
+
28
+ # In the case of `enqueue.active_job` middleware hasn't executed yet
29
+ task[:redis_pool] ||= Wayfarer::Redis::Pool.instance
30
+
31
+ handle(name, job, task)
32
+ end
33
+
34
+ def handle(name, job, task)
35
+ counter = Wayfarer::Redis::Counter.new(task)
36
+
37
+ case name
38
+ when "enqueue.active_job" then counter.increment unless retry?(job)
39
+ when "perform.active_job" then succeed!(task, counter) if succeeded?(job, task)
40
+ when "retry_stopped.active_job" then fail!(counter)
41
+ end
42
+ end
43
+
44
+ def succeed!(task, counter)
45
+ Wayfarer::GC.run(task) if counter.decrement == 0
46
+ end
47
+
48
+ def fail!(counter)
49
+ counter.decrement
50
+ end
51
+
52
+ def retry?(job)
53
+ job.executions > 0
54
+ end
55
+
56
+ def succeeded?(job, task)
57
+ job.exception_executions == task[:initial_exception_executions]
58
+ end
59
+ end
60
+ end
@@ -2,69 +2,43 @@
2
2
 
3
3
  module Wayfarer
4
4
  module Callbacks
5
- TERMINATOR = ->(_target, result) { result.call == false }
6
- OPTIONS = { terminator: TERMINATOR, skip_after_callbacks_if_terminated: true }.freeze
5
+ extend ActiveSupport::Concern
7
6
 
8
- ConditionalCallback = Struct.new(:job, :filters) do
9
- def run(method, &block)
10
- return if only && !applies?(only)
11
- return if except && applies?(except)
7
+ included do
8
+ include ActiveSupport::Callbacks
12
9
 
13
- return job.send(method) if method
10
+ define_callbacks :fetch, skip_after_callbacks_if_terminated: true
11
+ define_callbacks :action, skip_after_callbacks_if_terminated: true
12
+ define_callbacks :batch
13
+ end
14
14
 
15
- job.instance_eval(&block)
15
+ class_methods do
16
+ def before_fetch(...)
17
+ set_callback(:fetch, :before, ...)
16
18
  end
17
19
 
18
- private
19
-
20
- def applies?(condition)
21
- case condition
22
- when Symbol then condition == action
23
- when Enumerable then condition&.include?(action)
24
- end
20
+ def around_fetch(...)
21
+ set_callback(:fetch, :around, ...)
25
22
  end
26
23
 
27
- def only
28
- filters[:only]
24
+ def after_fetch(...)
25
+ set_callback(:fetch, :after, ...)
29
26
  end
30
27
 
31
- def except
32
- filters[:except]
28
+ def before_action(...)
29
+ set_callback(:action, :before, ...)
33
30
  end
34
31
 
35
- def action
36
- task.metadata.action
32
+ def around_action(...)
33
+ set_callback(:action, :around, ...)
37
34
  end
38
35
 
39
- def task
40
- job.task
36
+ def after_action(...)
37
+ set_callback(:action, :after, ...)
41
38
  end
42
- end
43
-
44
- def self.included(base)
45
- base.include(ActiveSupport::Callbacks)
46
- base.extend(ClassMethods)
47
-
48
- base.class_eval do
49
- define_callbacks(:fetch, OPTIONS)
50
- define_callbacks(:action, OPTIONS)
51
- define_callbacks(:batch, OPTIONS)
52
-
53
- define(:fetch, :before)
54
- define(:action, :before)
55
- define(:batch, :after)
56
- end
57
- end
58
-
59
- module ClassMethods
60
- private
61
39
 
62
- def define(name, stage)
63
- define_singleton_method([stage, name].join("_")) do |method = nil, **filters, &block|
64
- set_callback(name, stage, **filters) do |job|
65
- ConditionalCallback.new(job, filters).run(method, &block)
66
- end
67
- end
40
+ def after_batch(...)
41
+ set_callback(:batch, :after, ...)
68
42
  end
69
43
  end
70
44
  end
@@ -1,115 +1,111 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Wayfarer
4
- module CLI
5
- class RoutePrinter < Thor::Shell::Color
6
- attr_reader :url,
7
- :path_finder,
8
- :output
9
-
10
- INDENT = " "
11
- REGULAR_SEGMENT = "│ "
12
- JUNCTION_SEGMENT = "├──"
13
- CORNER_SEGMENT = "└──"
14
-
15
- def self.print(route, url)
16
- route.accept(new(url))
4
+ class CLI
5
+ # Turns a routing tree into a Hash and prints it.
6
+ # Used by the `route` CLI subcommand.
7
+ #
8
+ # @api private
9
+ class RoutePrinter
10
+ # @return [Hash<Symbol, Proc>]
11
+ class_attribute :serializers,
12
+ default: { yaml: ->(hash) { YAML.dump(hash.deep_stringify_keys) },
13
+ json: ->(hash) { JSON.pretty_generate(hash) },
14
+ ruby: ->(hash) { pp(hash) } },
15
+ instance_accessor: false,
16
+ instance_predicate: false
17
+
18
+ BATCH = "tmp"
19
+
20
+ # Prints a routing tree.
21
+ #
22
+ # @param route [Wayfarer::Routing::Route] route to print
23
+ # @param url [String] URL to match
24
+ # @param format [String, Symbol] `:json`, `:yaml` or `:ruby`
25
+ def self.print(route, url, format:)
26
+ new(route, url, serializers.fetch(format.to_sym)).print
17
27
  end
18
28
 
19
- def initialize(url)
20
- @url = url
21
- @path_finder = Wayfarer::Routing::PathFinder.new(url)
22
- super()
29
+ # @param route [Wayfarer::Routing::Route] route to print
30
+ # @param url [String] URL to match
31
+ # @param serializer [Proc<Hash=>String>] output serializer
32
+ def initialize(route, url, serializer)
33
+ @route = route
34
+ @serializer = serializer
35
+
36
+ @nodes = {}
37
+ @root_hash = nil
38
+
39
+ task = Wayfarer::Task.new(url, BATCH)
40
+ task[:uri] = Addressable::URI.parse(url)
41
+ @path_finder = Wayfarer::Routing::PathFinder.new(
42
+ task,
43
+ stop_when_found: false,
44
+ &method(:call)
45
+ )
23
46
  end
24
47
 
25
- def visit(route)
26
- route.accept(path_finder) unless route.parent
27
- return true if route.is_a?(Wayfarer::Routing::RootRoute)
28
-
29
- puts [segments(route), label(route)].join("")[3..]
30
- true
31
- end
32
-
33
- def segments(route)
34
- current = segment(route)
35
- parents = parents(route).map { |parent| parent_segment(parent) }
36
- [parents, current].join
37
- end
38
-
39
- def parent_segment(parent)
40
- if trailer?(parent)
41
- INDENT
42
- else
43
- REGULAR_SEGMENT
44
- end
45
- end
48
+ # Processes the routing trees and prints the serialized output.
49
+ def print
50
+ route.accept(path_finder)
46
51
 
47
- def segment(route)
48
- if trailer?(route)
49
- CORNER_SEGMENT
50
- else
51
- JUNCTION_SEGMENT
52
- end
53
- end
52
+ hash = routing_result(path_finder).merge(root_hash)
54
53
 
55
- def label(route)
56
- [highlight_matcher(route, matcher_label(route)),
57
- highlight_options(route, options(route)),
58
- highlight_options(route, params(route))].compact.join(" ")
54
+ puts serializer.call(hash)
59
55
  end
60
56
 
61
- def highlight_matcher(route, string)
62
- if path_finder.path.include?(route)
63
- set_color(string, :green, :bold)
64
- elsif route.matcher.match(url)
65
- set_color(string, :green)
66
- else
67
- set_color(string, :red)
68
- end
69
- end
57
+ # Callback method called by `path_finder` with the result of matching
58
+ # the route.
59
+ #
60
+ # @param route [Wayfarer::Routing::Route] the current route
61
+ # @param result [true, false] routing result
62
+ # @param path_finder [Wayfarer::Routing::PathFinder] the path finder
63
+ def call(route, result, path_finder)
64
+ node = (nodes[route] ||= attributes(route, result, path_finder))
65
+ parent = route.parent
70
66
 
71
- def highlight_options(route, string)
72
- return string unless path_finder.path.include?(route)
67
+ return @root_hash ||= node unless parent
73
68
 
74
- set_color(string, :green, :bold)
69
+ nodes.dig(parent, route_type(parent), :children).append(node)
75
70
  end
76
71
 
77
- def matcher_label(route)
78
- return "Target" if route.is_a?(Wayfarer::Routing::TargetRoute)
79
-
80
- route.matcher.class.name.demodulize
81
- end
72
+ private
82
73
 
83
- def options(route)
84
- return "" if route.is_a?(Wayfarer::Routing::RootRoute)
85
-
86
- case (matcher = route.matcher)
87
- when Wayfarer::Routing::Matchers::Host then matcher.host
88
- when Wayfarer::Routing::Matchers::Path then matcher.path
89
- when Wayfarer::Routing::Matchers::Query then matcher.fields
90
- when Wayfarer::Routing::Matchers::Custom then "##{route.action}"
91
- when Wayfarer::Routing::Matchers::Scheme then matcher.scheme
92
- when Wayfarer::Routing::Matchers::Suffix then matcher.suffix
93
- end
74
+ attr_reader :route,
75
+ :path_finder,
76
+ :serializer,
77
+ :nodes,
78
+ :root_hash
79
+
80
+ def routing_result(path_finder)
81
+ return { routed: false } unless path_finder.found?
82
+
83
+ action = path_finder.action
84
+ { routed: true,
85
+ params: path_finder.params,
86
+ action: case action
87
+ when Array
88
+ { handler: action.first.class.name, action: action.second }
89
+ else action
90
+ end }
94
91
  end
95
92
 
96
- def params(route)
97
- params = route.matcher.params(url)
98
- "=> #{params.symbolize_keys}" if params.any?
93
+ def attributes(route, result, path_finder)
94
+ { route_type(route) => route.to_h.merge!(
95
+ route_result(route, result, path_finder),
96
+ children: []
97
+ ) }
99
98
  end
100
99
 
101
- private
102
-
103
- def parents(route, current = [])
104
- return current unless route.parent
100
+ def route_result(route, result, path_finder)
101
+ return {} if route.target?
105
102
 
106
- parents(route.parent, [route.parent, *current])
103
+ { match: result,
104
+ params: path_finder.params_stack.to_h }
107
105
  end
108
106
 
109
- def trailer?(route)
110
- return true unless route.parent
111
-
112
- route.parent.children.last == route
107
+ def route_type(route)
108
+ route.class.name.demodulize.underscore
113
109
  end
114
110
  end
115
111
  end
@@ -0,0 +1,103 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Wayfarer
4
+ class CLI < Thor
5
+ def self.exit_on_failure?
6
+ true
7
+ end
8
+
9
+ desc "version", "Print version"
10
+ def version
11
+ say Wayfarer::VERSION::STRING
12
+ end
13
+
14
+ class_option :require, aliases: :r, type: :string, default: nil
15
+
16
+ desc "route JOB URL", "Routing tree for URL for JOB"
17
+ option :format, type: :string, enum: %w[yaml json ruby], default: "yaml"
18
+ def route(job, url)
19
+ load_environment
20
+
21
+ url = parsed_url(url)
22
+ job = job.camelize.constantize
23
+ route = job.route
24
+
25
+ Wayfarer::CLI::RoutePrinter.print(route, url, format: options.fetch("format"))
26
+ end
27
+
28
+ desc "perform JOB URL", "Perform JOB with URL"
29
+ option :mock_redis, type: :boolean
30
+ option :batch, type: :string, default: SecureRandom.uuid
31
+ def perform(job, url)
32
+ load_environment
33
+ mock_redis
34
+
35
+ job = job.camelize.constantize
36
+ task = Wayfarer::Task.new(url, options.fetch(:batch))
37
+ job.new(task).perform_now
38
+ end
39
+
40
+ desc "enqueue JOB URL", "Enqueue JOB with URL"
41
+ option :batch, type: :string, default: SecureRandom.uuid
42
+ def enqueue(job, url)
43
+ load_environment
44
+
45
+ job = job.camelize.constantize
46
+ job.crawl(url, batch: options[:batch])
47
+ end
48
+
49
+ desc "execute JOB URL", "Execute JOB with async adapter starting from URL"
50
+ option :mock_redis, type: :boolean
51
+ option :batch, type: :string, default: SecureRandom.uuid
52
+ option :min_threads, type: :numeric, default: 1
53
+ option :max_threads, type: :numeric, default: 1
54
+ option :retain_pool, type: :boolean, default: false
55
+ def execute(job, url)
56
+ load_environment
57
+ mock_redis
58
+
59
+ job = job.camelize.constantize
60
+ job.queue_adapter = ActiveJob::QueueAdapters::AsyncAdapter.new(min_threads: options[:min_threads],
61
+ max_threads: options[:max_threads])
62
+ scheduler = job.queue_adapter.instance_variable_get(:@scheduler)
63
+ executor = scheduler.instance_variable_get(:@async_executor)
64
+
65
+ job.crawl(url, batch: options.fetch(:batch))
66
+
67
+ sleep(0.1) while executor.scheduled_task_count > executor.completed_task_count
68
+
69
+ # Used in test suite to avoid pool recreation
70
+ Wayfarer::Networking::Pool.instance.free unless options.fetch(:retain_pool)
71
+ end
72
+
73
+ private
74
+
75
+ def mock_redis
76
+ Wayfarer.config[:redis][:factory] = ->(_) { MockRedis.new } if options[:mock_redis]
77
+ end
78
+
79
+ def parsed_url(url)
80
+ Addressable::URI.parse(url).normalize
81
+ end
82
+
83
+ def load_environment(require_path = options[:require])
84
+ require File.join(Dir.pwd, require_path) if require_path
85
+
86
+ load_rails
87
+ end
88
+
89
+ def load_rails
90
+ begin
91
+ require "rails/app_loader"
92
+ rescue LoadError
93
+ return
94
+ end
95
+
96
+ return unless Rails::AppLoader.find_executable
97
+
98
+ require File.expand_path("config/application", Dir.pwd)
99
+ require File.expand_path("config/boot", Dir.pwd)
100
+ require File.expand_path("config/environment", Dir.pwd)
101
+ end
102
+ end
103
+ end
data/lib/wayfarer/gc.rb CHANGED
@@ -1,14 +1,26 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Wayfarer
4
- GC = Struct.new(:task) do
5
- def run
6
- return unless task.counter.decrement <= 0
4
+ # TODO: Add logging
5
+ module GC
6
+ RESETTABLES = [Wayfarer::Redis::Barrier, Wayfarer::Redis::Counter].freeze
7
7
 
8
- task.metadata.job.run_callbacks(:batch)
8
+ class << self
9
+ include Wayfarer::Logging.emit(
10
+ after_batch: [:debug, "Running `after_batch` callback"],
11
+ gc: [:debug, "Garbage collecting %<resettable>s"]
12
+ )
13
+ end
14
+
15
+ module_function
16
+
17
+ def run(task)
18
+ task[:job].run_callbacks(:batch)
9
19
 
10
- task.barrier.reset!
11
- task.counter.reset!
20
+ RESETTABLES.each do |resettable|
21
+ log(:gc, task, resettable: resettable)
22
+ resettable.new(task).reset!
23
+ end
12
24
  end
13
25
  end
14
26
  end