wayfarer 0.4.6 → 0.4.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (259) hide show
  1. checksums.yaml +4 -4
  2. data/.env +17 -0
  3. data/.github/workflows/lint.yaml +27 -0
  4. data/.github/workflows/release.yaml +30 -0
  5. data/.github/workflows/tests.yaml +21 -0
  6. data/.gitignore +5 -1
  7. data/.rubocop.yml +36 -0
  8. data/.vale.ini +8 -0
  9. data/.yardopts +1 -3
  10. data/Dockerfile +6 -4
  11. data/Gemfile +24 -0
  12. data/Gemfile.lock +274 -164
  13. data/Rakefile +7 -51
  14. data/bin/wayfarer +1 -1
  15. data/docker-compose.yml +23 -13
  16. data/docs/cookbook/consent_screen.md +2 -2
  17. data/docs/cookbook/executing_javascript.md +3 -3
  18. data/docs/cookbook/navigation.md +12 -12
  19. data/docs/cookbook/querying_html.md +3 -3
  20. data/docs/cookbook/screenshots.md +2 -2
  21. data/docs/guides/callbacks.md +25 -125
  22. data/docs/guides/cli.md +71 -0
  23. data/docs/guides/configuration.md +10 -35
  24. data/docs/guides/development.md +67 -0
  25. data/docs/guides/handlers.md +60 -0
  26. data/docs/guides/index.md +1 -0
  27. data/docs/guides/jobs.md +142 -31
  28. data/docs/guides/navigation.md +1 -1
  29. data/docs/guides/networking/capybara.md +13 -22
  30. data/docs/guides/networking/custom_adapters.md +103 -41
  31. data/docs/guides/networking/ferrum.md +4 -4
  32. data/docs/guides/networking/http.md +9 -13
  33. data/docs/guides/networking/selenium.md +10 -11
  34. data/docs/guides/pages.md +78 -10
  35. data/docs/guides/redis.md +10 -0
  36. data/docs/guides/routing.md +156 -0
  37. data/docs/guides/tasks.md +53 -9
  38. data/docs/guides/tutorial.md +66 -0
  39. data/docs/guides/user_agents.md +115 -0
  40. data/docs/index.md +17 -40
  41. data/lib/wayfarer/base.rb +125 -46
  42. data/lib/wayfarer/batch_completion.rb +60 -0
  43. data/lib/wayfarer/callbacks.rb +22 -48
  44. data/lib/wayfarer/cli/route_printer.rb +85 -89
  45. data/lib/wayfarer/cli.rb +103 -0
  46. data/lib/wayfarer/gc.rb +18 -6
  47. data/lib/wayfarer/handler.rb +15 -7
  48. data/lib/wayfarer/kv.rb +28 -0
  49. data/lib/wayfarer/logging.rb +38 -0
  50. data/lib/wayfarer/middleware/base.rb +2 -0
  51. data/lib/wayfarer/middleware/batch_completion.rb +19 -0
  52. data/lib/wayfarer/middleware/chain.rb +7 -1
  53. data/lib/wayfarer/middleware/content_type.rb +59 -0
  54. data/lib/wayfarer/middleware/controller.rb +19 -15
  55. data/lib/wayfarer/middleware/dedup.rb +22 -13
  56. data/lib/wayfarer/middleware/dispatch.rb +17 -4
  57. data/lib/wayfarer/middleware/normalize.rb +7 -14
  58. data/lib/wayfarer/middleware/redis.rb +15 -0
  59. data/lib/wayfarer/middleware/router.rb +33 -35
  60. data/lib/wayfarer/middleware/stage.rb +5 -5
  61. data/lib/wayfarer/middleware/uri_parser.rb +31 -0
  62. data/lib/wayfarer/middleware/user_agent.rb +49 -0
  63. data/lib/wayfarer/networking/capybara.rb +1 -1
  64. data/lib/wayfarer/networking/context.rb +14 -3
  65. data/lib/wayfarer/networking/ferrum.rb +1 -4
  66. data/lib/wayfarer/networking/follow.rb +14 -7
  67. data/lib/wayfarer/networking/http.rb +1 -1
  68. data/lib/wayfarer/networking/pool.rb +23 -13
  69. data/lib/wayfarer/networking/selenium.rb +15 -7
  70. data/lib/wayfarer/networking/strategy.rb +2 -2
  71. data/lib/wayfarer/page.rb +34 -14
  72. data/lib/wayfarer/parsing/xml.rb +6 -6
  73. data/lib/wayfarer/parsing.rb +21 -0
  74. data/lib/wayfarer/redis/barrier.rb +26 -21
  75. data/lib/wayfarer/redis/counter.rb +18 -9
  76. data/lib/wayfarer/redis/pool.rb +1 -1
  77. data/lib/wayfarer/redis/resettable.rb +19 -0
  78. data/lib/wayfarer/routing/dsl.rb +166 -30
  79. data/lib/wayfarer/routing/hash_stack.rb +33 -0
  80. data/lib/wayfarer/routing/matchers/custom.rb +8 -5
  81. data/lib/wayfarer/routing/matchers/{suffix.rb → empty_params.rb} +2 -6
  82. data/lib/wayfarer/routing/matchers/host.rb +15 -9
  83. data/lib/wayfarer/routing/matchers/path.rb +11 -31
  84. data/lib/wayfarer/routing/matchers/query.rb +41 -17
  85. data/lib/wayfarer/routing/matchers/result.rb +12 -0
  86. data/lib/wayfarer/routing/matchers/scheme.rb +13 -5
  87. data/lib/wayfarer/routing/matchers/url.rb +13 -5
  88. data/lib/wayfarer/routing/path_consumer.rb +130 -0
  89. data/lib/wayfarer/routing/path_finder.rb +151 -23
  90. data/lib/wayfarer/routing/result.rb +1 -1
  91. data/lib/wayfarer/routing/root_route.rb +17 -1
  92. data/lib/wayfarer/routing/route.rb +66 -19
  93. data/lib/wayfarer/routing/serializable.rb +28 -0
  94. data/lib/wayfarer/routing/sub_route.rb +53 -0
  95. data/lib/wayfarer/routing/target_route.rb +17 -1
  96. data/lib/wayfarer/stringify.rb +21 -30
  97. data/lib/wayfarer/task.rb +9 -17
  98. data/lib/wayfarer/uri/normalization.rb +120 -0
  99. data/lib/wayfarer.rb +72 -5
  100. data/mise.toml +2 -0
  101. data/mkdocs.yml +44 -8
  102. data/rake/docs.rake +26 -0
  103. data/rake/lint.rake +9 -0
  104. data/rake/release.rake +23 -0
  105. data/rake/tests.rake +32 -0
  106. data/requirements.txt +1 -1
  107. data/spec/factories/job.rb +8 -0
  108. data/spec/factories/middleware.rb +2 -2
  109. data/spec/factories/path_finder.rb +11 -0
  110. data/spec/factories/redis.rb +19 -0
  111. data/spec/factories/task.rb +46 -2
  112. data/spec/spec_helpers.rb +55 -51
  113. data/spec/support/active_job_helpers.rb +8 -0
  114. data/spec/support/integration_helpers.rb +21 -0
  115. data/spec/support/redis_helpers.rb +9 -0
  116. data/spec/support/test_app.rb +66 -37
  117. data/spec/wayfarer/base_spec.rb +200 -0
  118. data/spec/wayfarer/batch_completion_spec.rb +142 -0
  119. data/spec/wayfarer/cli/job_spec.rb +88 -0
  120. data/spec/wayfarer/cli/routing_spec.rb +322 -0
  121. data/spec/{cli → wayfarer/cli}/version_spec.rb +1 -1
  122. data/spec/wayfarer/gc_spec.rb +29 -0
  123. data/spec/wayfarer/handler_spec.rb +9 -0
  124. data/spec/wayfarer/integration/callbacks_spec.rb +200 -0
  125. data/spec/wayfarer/integration/content_type_spec.rb +37 -0
  126. data/spec/wayfarer/integration/custom_routing_spec.rb +51 -0
  127. data/spec/wayfarer/integration/gc_spec.rb +40 -0
  128. data/spec/wayfarer/integration/handler_spec.rb +65 -0
  129. data/spec/wayfarer/integration/page_spec.rb +79 -0
  130. data/spec/wayfarer/integration/params_spec.rb +64 -0
  131. data/spec/wayfarer/integration/parsing_spec.rb +99 -0
  132. data/spec/wayfarer/integration/retry_spec.rb +112 -0
  133. data/spec/wayfarer/integration/stage_spec.rb +58 -0
  134. data/spec/wayfarer/middleware/batch_completion_spec.rb +33 -0
  135. data/spec/{middleware → wayfarer/middleware}/chain_spec.rb +24 -19
  136. data/spec/wayfarer/middleware/content_type_spec.rb +83 -0
  137. data/spec/{middleware → wayfarer/middleware}/controller_spec.rb +24 -22
  138. data/spec/wayfarer/middleware/dedup_spec.rb +66 -0
  139. data/spec/wayfarer/middleware/normalize_spec.rb +32 -0
  140. data/spec/wayfarer/middleware/router_spec.rb +102 -0
  141. data/spec/wayfarer/middleware/stage_spec.rb +63 -0
  142. data/spec/wayfarer/middleware/uri_parser_spec.rb +63 -0
  143. data/spec/wayfarer/middleware/user_agent_spec.rb +158 -0
  144. data/spec/wayfarer/networking/capybara_spec.rb +13 -0
  145. data/spec/{networking → wayfarer/networking}/context_spec.rb +46 -38
  146. data/spec/wayfarer/networking/ferrum_spec.rb +13 -0
  147. data/spec/{networking → wayfarer/networking}/follow_spec.rb +11 -6
  148. data/spec/wayfarer/networking/http_spec.rb +12 -0
  149. data/spec/{networking → wayfarer/networking}/pool_spec.rb +16 -14
  150. data/spec/wayfarer/networking/selenium_spec.rb +12 -0
  151. data/spec/{networking → wayfarer/networking}/strategy.rb +33 -54
  152. data/spec/wayfarer/page_spec.rb +69 -0
  153. data/spec/{parsing → wayfarer/parsing}/json_spec.rb +1 -1
  154. data/spec/wayfarer/parsing/xml_parse_spec.rb +25 -0
  155. data/spec/wayfarer/redis/barrier_spec.rb +39 -0
  156. data/spec/wayfarer/redis/counter_spec.rb +34 -0
  157. data/spec/{redis → wayfarer/redis}/pool_spec.rb +4 -3
  158. data/spec/{routing → wayfarer/routing}/dsl_spec.rb +12 -22
  159. data/spec/wayfarer/routing/hash_stack_spec.rb +63 -0
  160. data/spec/wayfarer/routing/integration_spec.rb +101 -0
  161. data/spec/wayfarer/routing/matchers/custom_spec.rb +39 -0
  162. data/spec/wayfarer/routing/matchers/host_spec.rb +56 -0
  163. data/spec/wayfarer/routing/matchers/matcher.rb +17 -0
  164. data/spec/wayfarer/routing/matchers/path_spec.rb +43 -0
  165. data/spec/wayfarer/routing/matchers/query_spec.rb +123 -0
  166. data/spec/wayfarer/routing/matchers/scheme_spec.rb +45 -0
  167. data/spec/wayfarer/routing/matchers/url_spec.rb +33 -0
  168. data/spec/wayfarer/routing/path_consumer_spec.rb +123 -0
  169. data/spec/wayfarer/routing/path_finder_spec.rb +409 -0
  170. data/spec/wayfarer/routing/root_route_spec.rb +51 -0
  171. data/spec/wayfarer/routing/route_spec.rb +74 -0
  172. data/spec/wayfarer/routing/sub_route_spec.rb +103 -0
  173. data/spec/wayfarer/task_spec.rb +13 -0
  174. data/spec/wayfarer/uri/normalization_spec.rb +98 -0
  175. data/spec/wayfarer_spec.rb +2 -2
  176. data/wayfarer.gemspec +18 -28
  177. metadata +797 -265
  178. data/.github/workflows/ci.yaml +0 -32
  179. data/.rbenv-gemsets +0 -1
  180. data/.ruby-version +0 -1
  181. data/RELEASING.md +0 -17
  182. data/docs/cookbook/user_agent.md +0 -7
  183. data/docs/guides/error_handling.md +0 -53
  184. data/docs/guides/networking.md +0 -94
  185. data/docs/guides/performance.md +0 -130
  186. data/docs/guides/reliability.md +0 -41
  187. data/docs/guides/routing/steering.md +0 -30
  188. data/docs/reference/api/base.md +0 -48
  189. data/docs/reference/cli.md +0 -61
  190. data/docs/reference/configuration_keys.md +0 -43
  191. data/docs/reference/environment_variables.md +0 -83
  192. data/lib/wayfarer/cli/base.rb +0 -45
  193. data/lib/wayfarer/cli/generate.rb +0 -17
  194. data/lib/wayfarer/cli/job.rb +0 -56
  195. data/lib/wayfarer/cli/route.rb +0 -29
  196. data/lib/wayfarer/cli/runner.rb +0 -34
  197. data/lib/wayfarer/cli/templates/Gemfile.tt +0 -5
  198. data/lib/wayfarer/cli/templates/job.rb.tt +0 -10
  199. data/lib/wayfarer/config/capybara.rb +0 -10
  200. data/lib/wayfarer/config/ferrum.rb +0 -11
  201. data/lib/wayfarer/config/networking.rb +0 -29
  202. data/lib/wayfarer/config/redis.rb +0 -14
  203. data/lib/wayfarer/config/root.rb +0 -11
  204. data/lib/wayfarer/config/selenium.rb +0 -21
  205. data/lib/wayfarer/config/strconv.rb +0 -45
  206. data/lib/wayfarer/config/struct.rb +0 -72
  207. data/lib/wayfarer/middleware/fetch.rb +0 -56
  208. data/lib/wayfarer/redis/connection.rb +0 -13
  209. data/lib/wayfarer/redis/version.rb +0 -19
  210. data/lib/wayfarer/routing/router.rb +0 -28
  211. data/spec/base_spec.rb +0 -224
  212. data/spec/callbacks_spec.rb +0 -102
  213. data/spec/cli/generate_spec.rb +0 -39
  214. data/spec/cli/job_spec.rb +0 -78
  215. data/spec/config/capybara_spec.rb +0 -18
  216. data/spec/config/ferrum_spec.rb +0 -24
  217. data/spec/config/networking_spec.rb +0 -73
  218. data/spec/config/redis_spec.rb +0 -32
  219. data/spec/config/root_spec.rb +0 -31
  220. data/spec/config/selenium_spec.rb +0 -56
  221. data/spec/config/strconv_spec.rb +0 -58
  222. data/spec/config/struct_spec.rb +0 -66
  223. data/spec/fixtures/dummy_job.rb +0 -7
  224. data/spec/gc_spec.rb +0 -59
  225. data/spec/handler_spec.rb +0 -11
  226. data/spec/integration/callbacks_spec.rb +0 -85
  227. data/spec/integration/page_spec.rb +0 -62
  228. data/spec/integration/params_spec.rb +0 -56
  229. data/spec/integration/stage_spec.rb +0 -51
  230. data/spec/integration/steering_spec.rb +0 -57
  231. data/spec/middleware/dedup_spec.rb +0 -88
  232. data/spec/middleware/dispatch_spec.rb +0 -43
  233. data/spec/middleware/fetch_spec.rb +0 -155
  234. data/spec/middleware/normalize_spec.rb +0 -29
  235. data/spec/middleware/router_spec.rb +0 -105
  236. data/spec/middleware/stage_spec.rb +0 -62
  237. data/spec/networking/capybara_spec.rb +0 -12
  238. data/spec/networking/ferrum_spec.rb +0 -12
  239. data/spec/networking/http_spec.rb +0 -12
  240. data/spec/networking/selenium_spec.rb +0 -12
  241. data/spec/page_spec.rb +0 -47
  242. data/spec/parsing/xml_spec.rb +0 -25
  243. data/spec/redis/barrier_spec.rb +0 -78
  244. data/spec/redis/counter_spec.rb +0 -32
  245. data/spec/redis/version_spec.rb +0 -13
  246. data/spec/routing/integration_spec.rb +0 -110
  247. data/spec/routing/matchers/custom_spec.rb +0 -31
  248. data/spec/routing/matchers/host_spec.rb +0 -49
  249. data/spec/routing/matchers/path_spec.rb +0 -43
  250. data/spec/routing/matchers/query_spec.rb +0 -137
  251. data/spec/routing/matchers/scheme_spec.rb +0 -25
  252. data/spec/routing/matchers/suffix_spec.rb +0 -41
  253. data/spec/routing/matchers/uri_spec.rb +0 -27
  254. data/spec/routing/path_finder_spec.rb +0 -33
  255. data/spec/routing/root_route_spec.rb +0 -29
  256. data/spec/routing/route_spec.rb +0 -43
  257. data/spec/routing/router_spec.rb +0 -24
  258. data/spec/task_spec.rb +0 -34
  259. data/spec/{stringify_spec.rb → wayfarer/stringify_spec.rb} +2 -2
@@ -1,15 +1,23 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Wayfarer
4
- class Handler
5
- include Wayfarer::Middleware::Controller
4
+ module Handler
5
+ extend ActiveSupport::Concern
6
6
 
7
- api Wayfarer::Middleware::Fetch
8
- api Wayfarer::Middleware::Stage
7
+ included do
8
+ include Wayfarer::Middleware::Controller
9
9
 
10
- use Wayfarer::Middleware::Router
11
- use Wayfarer::Middleware::Dispatch
10
+ use Wayfarer::Middleware::ContentType
11
+ use Wayfarer::Middleware::Router
12
+ use Wayfarer::Middleware::Dispatch
12
13
 
13
- singleton_class.undef_method :after_batch
14
+ api Wayfarer::Middleware::UserAgent
15
+ api Wayfarer::Middleware::Stage
16
+
17
+ singleton_class.undef_method :before_fetch
18
+ singleton_class.undef_method :around_fetch
19
+ singleton_class.undef_method :after_fetch
20
+ singleton_class.undef_method :after_batch
21
+ end
14
22
  end
15
23
  end
@@ -0,0 +1,28 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Wayfarer
4
+ # Provides a key-value store via `[]` and `[]=`.
5
+ #
6
+ # @api private
7
+ module KV
8
+ # @param key [Object] key to fetch
9
+ # @return [Object, nil] value associated with the key or `nil`
10
+ def [](key)
11
+ kv[key]
12
+ end
13
+
14
+ # @param key [Object] key to set
15
+ # @param value [Object] value to set
16
+ # @return [Object] value that was set
17
+ def []=(key, value)
18
+ kv[key] = value
19
+ end
20
+
21
+ private
22
+
23
+ # @return [Hash<Object, Object>]
24
+ def kv
25
+ @kv ||= {}
26
+ end
27
+ end
28
+ end
@@ -0,0 +1,38 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Wayfarer
4
+ module Logging
5
+ mattr_accessor :logger, default: ActiveSupport::Logger.new($stdout)
6
+
7
+ def self.emit(...)
8
+ Emitter.new(...)
9
+ end
10
+
11
+ class Emitter < Module
12
+ def initialize(messages)
13
+ @messages = messages
14
+
15
+ super()
16
+ end
17
+
18
+ def included(base)
19
+ messages = @messages
20
+
21
+ base.class_eval do
22
+ define_method(:log) do |key, task, **args|
23
+ level, msg = messages[key] || raise(ArgumentError, "No log message for #{key.inspect}")
24
+ severity = ActiveSupport::Logger::Severity.const_get(level.upcase)
25
+
26
+ ActiveSupport::TaggedLogging
27
+ .new(Logging.logger)
28
+ .tagged(task.batch, task.url, task[:controller]&.class&.name) do |logger|
29
+ logger.add(severity, msg % args)
30
+ end
31
+ end
32
+ end
33
+ end
34
+ end
35
+
36
+ private_constant :Emitter
37
+ end
38
+ end
@@ -3,6 +3,8 @@
3
3
  module Wayfarer
4
4
  module Middleware
5
5
  module Base
6
+ extend ActiveSupport::Concern
7
+
6
8
  API_MODULE = :API
7
9
 
8
10
  def api
@@ -0,0 +1,19 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Wayfarer
4
+ module Middleware
5
+ class BatchCompletion
6
+ extend Base
7
+
8
+ def call(task)
9
+ # Comparing to the initial state of `exception_executions` allows
10
+ # us to determine if an exception occurred when the job was performed,
11
+ # since the `perform.active_job` event is emitted for both successful
12
+ # and raising jobs.
13
+ task[:initial_exception_executions] ||= task[:job].exception_executions.clone
14
+
15
+ yield if block_given?
16
+ end
17
+ end
18
+ end
19
+ end
@@ -2,9 +2,15 @@
2
2
 
3
3
  module Wayfarer
4
4
  module Middleware
5
- Chain = Struct.new(:middlewares) do
5
+ class Chain
6
6
  extend Forwardable
7
7
 
8
+ attr_reader :middlewares
9
+
10
+ def initialize(middlewares)
11
+ @middlewares = middlewares
12
+ end
13
+
8
14
  def self.empty
9
15
  new([])
10
16
  end
@@ -0,0 +1,59 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Wayfarer
4
+ module Middleware
5
+ class ContentType
6
+ extend Base
7
+
8
+ module API
9
+ extend ActiveSupport::Concern
10
+
11
+ included do
12
+ class_attribute :allowed_content_types,
13
+ default: { index: {}, patterns: Set.new },
14
+ instance_accessor: false,
15
+ instance_predicate: false
16
+ end
17
+
18
+ class_methods do
19
+ def content_type(*types)
20
+ grouped_types = types.group_by(&:class)
21
+
22
+ new_index = grouped_types.fetch(String, []).index_with { true }
23
+ new_patterns = grouped_types.fetch(Regexp, [])
24
+
25
+ self.allowed_content_types = {
26
+ index: allowed_content_types.fetch(:index).merge(new_index),
27
+ patterns: allowed_content_types.fetch(:patterns).merge(new_patterns)
28
+ }
29
+ end
30
+ end
31
+ end
32
+
33
+ def call(task)
34
+ yield if block_given? && permitted?(task)
35
+ end
36
+
37
+ private
38
+
39
+ def permitted?(task)
40
+ job = task[:controller]
41
+ rules = job.class.allowed_content_types
42
+
43
+ return true if no_restrictions?(rules)
44
+
45
+ return false unless (content_type = task[:page].mime_type&.to_s || task[:page].content_type)
46
+
47
+ content_type && matches_rules?(content_type, rules)
48
+ end
49
+
50
+ def matches_rules?(type, rules)
51
+ rules.fetch(:index).key?(type) || rules.fetch(:patterns).any? { |p| p.match?(type) }
52
+ end
53
+
54
+ def no_restrictions?(rules)
55
+ rules.values_at(:index, :patterns).all?(&:empty?)
56
+ end
57
+ end
58
+ end
59
+ end
@@ -3,16 +3,20 @@
3
3
  module Wayfarer
4
4
  module Middleware
5
5
  module Controller
6
- def self.included(base)
7
- base.cattr_accessor :chain, default: Chain.empty
8
- base.attr_accessor :task
6
+ extend ActiveSupport::Concern
9
7
 
10
- base.extend(ClassMethods)
11
- base.include(InstanceMethods)
12
- base.include(Wayfarer::Callbacks)
8
+ included do
9
+ class_attribute :chain,
10
+ default: Chain.empty,
11
+ instance_accessor: false,
12
+ instance_predicate: false
13
+
14
+ attr_accessor :task
15
+
16
+ include Wayfarer::Callbacks
13
17
  end
14
18
 
15
- module ClassMethods
19
+ class_methods do
16
20
  def use(middleware)
17
21
  chain.push(middleware.lazy)
18
22
  api(middleware)
@@ -23,17 +27,17 @@ module Wayfarer
23
27
  end
24
28
  end
25
29
 
26
- module InstanceMethods
27
- def call(task)
28
- self.task = task
30
+ def call(task)
31
+ self.task = task
29
32
 
30
- task.metadata.job ||= self
31
- task.metadata.controller = self
33
+ task[:job] ||= self
34
+ task[:controller] = self
32
35
 
33
- self.class.chain.call(task) do
34
- yield if block_given?
35
- end
36
+ self.class.chain.call(task) do
37
+ yield if block_given?
36
38
  end
39
+
40
+ task[:return_value]
37
41
  end
38
42
  end
39
43
  end
@@ -5,25 +5,34 @@ module Wayfarer
5
5
  class Dedup
6
6
  extend Base
7
7
 
8
+ include Wayfarer::Logging.emit(
9
+ deduplicated: [:info, "Deduplicated URL"],
10
+ retry: [:debug, "Not deduplicating retry"],
11
+ rerouted: [:debug, "Not deduplicating rerouted task"]
12
+ )
13
+
8
14
  def call(task)
9
- # Was task routed by a previous controller already?
10
- return yield if task.metadata.action
15
+ task[:barrier] ||= Wayfarer::Redis::Barrier.new(task)
11
16
 
12
- return if task.barrier.seen?(task.url)
17
+ if task[:job].executions > 1
18
+ log(:retry, task)
19
+ return (yield if block_given?)
20
+ end
13
21
 
14
- begin
15
- yield if block_given?
16
- rescue StandardError => e
17
- task.barrier.unsee(task.url)
18
- raise e
22
+ if task[:job] != task[:controller]
23
+ log(:rerouted, task)
24
+ return (yield if block_given?)
19
25
  end
20
26
 
21
- staged_urls = task.metadata.staged_urls
22
- return if staged_urls.none?
27
+ return log(:deduplicated, task) if task[:barrier].check!(key(task))
28
+
29
+ yield if block_given?
30
+ end
31
+
32
+ private
23
33
 
24
- inclusion = task.barrier.peek(staged_urls.to_a)
25
- unseen = staged_urls.zip(inclusion).reject { |_, seen| seen }.map(&:first)
26
- task.metadata.staged_urls = SortedSet.new(unseen)
34
+ def key(task)
35
+ Wayfarer.config.dig(:deduplication, :key).call(task)
27
36
  end
28
37
  end
29
38
  end
@@ -5,13 +5,26 @@ module Wayfarer
5
5
  class Dispatch
6
6
  extend Base
7
7
 
8
+ InvalidTargetError = Class.new(ArgumentError)
9
+
8
10
  def call(task)
9
- controller = task.metadata.controller
11
+ controller = task[:controller]
10
12
 
11
- controller.run_callbacks(:action) do
12
- case action = task.metadata.action
13
+ task[:return_value] = controller.run_callbacks(:action) do
14
+ case action = task[:action]
13
15
  when Symbol then controller.public_send(action)
14
- else action.new.call(task)
16
+ when Array
17
+ handler, method = action
18
+ task[:action] = method
19
+ handler.new.call(task)
20
+ else
21
+ unless action&.include?(Wayfarer::Handler)
22
+ raise InvalidTargetError, "routed to invalid action: #{action.inspect}"
23
+ end
24
+
25
+ task[:action] = nil
26
+
27
+ action.new.call(task)
15
28
  end
16
29
  end
17
30
 
@@ -4,23 +4,16 @@ module Wayfarer
4
4
  module Middleware
5
5
  class Normalize
6
6
  extend Base
7
+ include Wayfarer::Logging.emit(
8
+ invalid: [:info, "Failed to normalize URL"]
9
+ )
7
10
 
8
11
  def call(task)
9
- yield if block_given?
10
-
11
- task.metadata.staged_urls = SortedSet.new(normalized_urls(task).compact)
12
- end
13
-
14
- private
12
+ Wayfarer::URI::Normalization.canonical!(task[:uri])
15
13
 
16
- def normalized_urls(task)
17
- task.metadata.staged_urls.map(&method(:normalize))
18
- end
19
-
20
- def normalize(url)
21
- NormalizeUrl.process(url)
22
- rescue NormalizeUrl::InvalidURIError
23
- nil
14
+ yield if block_given?
15
+ rescue Wayfarer::URI::Normalization::InvalidURIError
16
+ log(:invalid, task)
24
17
  end
25
18
  end
26
19
  end
@@ -0,0 +1,15 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Wayfarer
4
+ module Middleware
5
+ class Redis
6
+ extend Base
7
+
8
+ def call(task)
9
+ task[:redis_pool] ||= Wayfarer::Redis::Pool.instance
10
+
11
+ yield if block_given?
12
+ end
13
+ end
14
+ end
15
+ end
@@ -5,51 +5,49 @@ module Wayfarer
5
5
  class Router
6
6
  extend Base
7
7
 
8
- module API
9
- def self.included(base)
10
- base.extend(ClassMethods)
11
- base.include(InstanceMethods)
12
- end
13
-
14
- module ClassMethods
15
- def router
16
- # TODO: Use cattr_accessor
17
- @router ||= Wayfarer::Routing::Router.new
18
- end
8
+ include Wayfarer::Logging.emit(
9
+ mismatch: [:info, "No matching route"],
10
+ match: [:info, "Routing to %<action>s"],
11
+ already_routed: [:debug, "Already routed to %<action>s"]
12
+ )
19
13
 
20
- def route(&block)
21
- router.draw(&block) if block_given?
22
- end
14
+ module API
15
+ extend ActiveSupport::Concern
23
16
 
24
- def steer(&block)
25
- define_method(:steer) { block.call(task) }
26
- end
17
+ included do
18
+ class_attribute :route,
19
+ default: Wayfarer::Routing::RootRoute.new,
20
+ instance_accessor: false,
21
+ instance_predicate: false
27
22
  end
28
23
 
29
- module InstanceMethods
30
- def steer
31
- []
32
- end
24
+ def action
25
+ task[:action]
26
+ end
33
27
 
34
- def params
35
- task.metadata.params
36
- end
28
+ def params
29
+ task[:params]
37
30
  end
38
31
  end
39
32
 
40
33
  def call(task)
41
- controller = task.metadata.controller
42
- # TODO: The router has to be cloned because it's not thread-safe
43
- router = controller.class.router.clone
44
- url = Addressable::URI.parse(task.url)
45
-
46
- case result = router.invoke(url, controller.steer)
47
- when Routing::Result::Mismatch
48
- return
34
+ # Avoid rerouting when dispatching a [Controller, :action] pair
35
+ if (action = task[:action])
36
+ log(:already_routed, task, action: action)
37
+
38
+ return (yield if block_given?)
39
+ end
40
+
41
+ case result = task[:controller].class.route.invoke(task)
42
+ when Routing::Result::Mismatch then return log(:mismatch, task)
49
43
  when Routing::Result::Match
50
- task.metadata.action = result.action
51
- task.metadata.params ||= ActiveSupport::HashWithIndifferentAccess.new
52
- task.metadata.params.merge!(result.params)
44
+ action = result.action
45
+
46
+ log(:match, task, action: action.inspect)
47
+
48
+ task[:action] = action
49
+ task[:params] ||= ActiveSupport::HashWithIndifferentAccess.new
50
+ task[:params].merge!(result.params)
53
51
  end
54
52
 
55
53
  yield if block_given?
@@ -7,20 +7,20 @@ module Wayfarer
7
7
 
8
8
  module API
9
9
  def stage(urls)
10
- Array.wrap(urls).each { |url| task.metadata.staged_urls.add(url.to_s) }
10
+ Array.wrap(urls).each { |url| task[:staged_urls].add(url.to_s) }
11
11
  end
12
12
  end
13
13
 
14
14
  def call(task)
15
- task.metadata.staged_urls = SortedSet.new
15
+ task[:staged_urls] = Set.new
16
16
 
17
17
  yield if block_given?
18
18
 
19
- task.metadata.staged_urls.each do |url|
20
- task.metadata.job.class.crawl(url, batch: task.batch)
19
+ task[:staged_urls].each do |url|
20
+ task[:job].class.crawl(url, batch: task.batch)
21
21
  end
22
22
 
23
- task.metadata.staged_urls.clear
23
+ task[:staged_urls].clear
24
24
  end
25
25
  end
26
26
  end
@@ -0,0 +1,31 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Wayfarer
4
+ module Middleware
5
+ class UriParser
6
+ extend Base
7
+
8
+ include Wayfarer::Logging.emit(
9
+ invalid: [:info, "Not processing invalid URL (%<message>s)"]
10
+ )
11
+
12
+ module API
13
+ def uri
14
+ task[:uri]
15
+ end
16
+ end
17
+
18
+ def call(task)
19
+ return (yield if block_given?) if task[:uri]
20
+
21
+ begin
22
+ task[:uri] = Addressable::URI.parse(task.url)
23
+ rescue Addressable::URI::InvalidURIError => e
24
+ return log(:invalid, task, message: e.message)
25
+ end
26
+
27
+ yield if block_given?
28
+ end
29
+ end
30
+ end
31
+ end
@@ -0,0 +1,49 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Wayfarer
4
+ module Middleware
5
+ class UserAgent
6
+ extend Base
7
+
8
+ module API
9
+ def user_agent
10
+ task[:context]&.instance
11
+ end
12
+
13
+ def page(live: false)
14
+ return task[:page] unless live
15
+
16
+ task[:page] = task[:context].live&.page || task[:page]
17
+ end
18
+
19
+ def fetch(url, follow: 3)
20
+ (@http ||= Wayfarer::Networking::Follow.http).fetch(url, follow: follow)
21
+ end
22
+ end
23
+
24
+ def call(task)
25
+ pool.with do |context|
26
+ task[:context] = context
27
+
28
+ result = task[:controller].run_callbacks(:fetch) do
29
+ context.fetch(task.url)
30
+ end
31
+
32
+ case result
33
+ when Networking::Result::Redirect
34
+ task[:controller].stage(result.redirect_url)
35
+ when Networking::Result::Success
36
+ task[:page] = result.page
37
+ yield if block_given?
38
+ end
39
+ end
40
+ end
41
+
42
+ private
43
+
44
+ def pool
45
+ Wayfarer::Networking::Pool.instance
46
+ end
47
+ end
48
+ end
49
+ end
@@ -6,7 +6,7 @@ module Wayfarer
6
6
  include Strategy
7
7
 
8
8
  def create
9
- ::Capybara::Session.new(Wayfarer.config.capybara.driver, nil)
9
+ ::Capybara::Session.new(Wayfarer.config[:capybara][:driver], nil)
10
10
  end
11
11
 
12
12
  def destroy(instance)
@@ -2,11 +2,22 @@
2
2
 
3
3
  module Wayfarer
4
4
  module Networking
5
- Context = Struct.new(:strategy) do
5
+ class Context
6
+ attr_reader :strategy
7
+
8
+ def initialize(strategy)
9
+ @strategy = strategy
10
+ end
11
+
6
12
  def fetch(url)
7
13
  supervise { strategy.fetch(instance, url) }
8
14
  end
9
15
 
16
+ def navigate(url)
17
+ fetch(url)
18
+ live
19
+ end
20
+
10
21
  def live
11
22
  supervise { strategy.live(instance) }
12
23
  end
@@ -25,8 +36,8 @@ module Wayfarer
25
36
 
26
37
  def supervise
27
38
  yield
28
- rescue *strategy.renew_on, *Wayfarer.config.network.renew_on => e
29
- renew
39
+ rescue *strategy.renew_on, *Wayfarer.config[:network][:renew_on] => e
40
+ renew # may raise
30
41
  ensure
31
42
  # If renewing raises, re-raise the originally caught exception
32
43
  # TODO: Not nice this effectively swallows exceptions
@@ -10,13 +10,10 @@ module Wayfarer
10
10
  end
11
11
 
12
12
  def create
13
- ::Ferrum::Browser.new(Wayfarer.config.ferrum.options).tap do |browser|
14
- browser.headers.set(Wayfarer.config.network.http_headers)
15
- end
13
+ ::Ferrum::Browser.new(Wayfarer.config.dig(:ferrum, :options))
16
14
  end
17
15
 
18
16
  def destroy(instance)
19
- instance.reset
20
17
  instance.quit
21
18
  end
22
19
 
@@ -2,17 +2,24 @@
2
2
 
3
3
  module Wayfarer
4
4
  module Networking
5
- RedirectsExhaustedError = Class.new(StandardError)
5
+ class Follow
6
+ RedirectsExhaustedError = Class.new(StandardError)
6
7
 
7
- Follow = Struct.new(:context) do
8
- extend Forwardable
8
+ def self.http
9
+ new(Wayfarer::Networking::Context.new(Wayfarer::Networking::HTTP.new))
10
+ end
11
+
12
+ attr_reader :context
9
13
 
10
- delegate %i[live renew instance] => :context
14
+ def initialize(context)
15
+ @context = context
16
+ end
11
17
 
12
- def fetch(url, follow: 3)
13
- raise RedirectsExhaustedError if follow.negative?
18
+ def fetch(url, follow:)
19
+ raise RedirectsExhaustedError if follow < 0
14
20
 
15
- case result = context.fetch(url)
21
+ result = context.fetch(url)
22
+ case result
16
23
  when Result::Success then result.page
17
24
  when Result::Redirect then fetch(result.redirect_url, follow: follow - 1)
18
25
  end