wayfarer 0.4.5 → 0.4.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (175) hide show
  1. checksums.yaml +4 -4
  2. data/.github/workflows/lint.yaml +25 -0
  3. data/.github/workflows/release.yaml +29 -0
  4. data/.github/workflows/tests.yaml +30 -0
  5. data/.gitignore +4 -0
  6. data/.rubocop.yml +5 -0
  7. data/.vale.ini +5 -0
  8. data/.yardopts +1 -3
  9. data/Dockerfile +5 -4
  10. data/Gemfile +3 -0
  11. data/Gemfile.lock +107 -102
  12. data/Rakefile +5 -56
  13. data/bin/wayfarer +1 -1
  14. data/docker-compose.yml +20 -9
  15. data/docs/cookbook/consent_screen.md +2 -2
  16. data/docs/cookbook/executing_javascript.md +3 -3
  17. data/docs/cookbook/navigation.md +12 -12
  18. data/docs/cookbook/querying_html.md +3 -3
  19. data/docs/cookbook/screenshots.md +2 -2
  20. data/docs/cookbook/user_agent.md +1 -1
  21. data/docs/design.md +36 -0
  22. data/docs/guides/callbacks.md +24 -126
  23. data/docs/guides/configuration.md +8 -8
  24. data/docs/guides/handlers.md +60 -0
  25. data/docs/guides/index.md +1 -0
  26. data/docs/guides/jobs/error_handling.md +40 -0
  27. data/docs/guides/jobs.md +99 -31
  28. data/docs/guides/navigation.md +1 -1
  29. data/docs/guides/networking/capybara.md +13 -22
  30. data/docs/guides/networking/custom_adapters.md +82 -41
  31. data/docs/guides/networking/ferrum.md +4 -4
  32. data/docs/guides/networking/http.md +9 -13
  33. data/docs/guides/networking/selenium.md +10 -11
  34. data/docs/guides/pages.md +76 -10
  35. data/docs/guides/redis.md +10 -0
  36. data/docs/guides/routing.md +74 -0
  37. data/docs/guides/tasks.md +33 -9
  38. data/docs/guides/tutorial.md +60 -0
  39. data/docs/guides/user_agents.md +113 -0
  40. data/docs/index.md +17 -40
  41. data/docs/reference/cli.md +35 -25
  42. data/docs/reference/configuration.md +36 -0
  43. data/lib/wayfarer/base.rb +124 -46
  44. data/lib/wayfarer/batch_completion.rb +56 -0
  45. data/lib/wayfarer/callbacks.rb +22 -48
  46. data/lib/wayfarer/cli/route_printer.rb +71 -57
  47. data/lib/wayfarer/cli.rb +121 -0
  48. data/lib/wayfarer/gc.rb +13 -6
  49. data/lib/wayfarer/handler.rb +15 -7
  50. data/lib/wayfarer/logging.rb +38 -0
  51. data/lib/wayfarer/middleware/base.rb +2 -0
  52. data/lib/wayfarer/middleware/batch_completion.rb +19 -0
  53. data/lib/wayfarer/middleware/content_type.rb +54 -0
  54. data/lib/wayfarer/middleware/controller.rb +19 -15
  55. data/lib/wayfarer/middleware/dedup.rb +16 -13
  56. data/lib/wayfarer/middleware/dispatch.rb +12 -4
  57. data/lib/wayfarer/middleware/normalize.rb +12 -11
  58. data/lib/wayfarer/middleware/redis.rb +15 -0
  59. data/lib/wayfarer/middleware/router.rb +33 -35
  60. data/lib/wayfarer/middleware/stage.rb +5 -5
  61. data/lib/wayfarer/middleware/uri_parser.rb +30 -0
  62. data/lib/wayfarer/middleware/user_agent.rb +49 -0
  63. data/lib/wayfarer/networking/capybara.rb +1 -1
  64. data/lib/wayfarer/networking/context.rb +2 -2
  65. data/lib/wayfarer/networking/ferrum.rb +2 -2
  66. data/lib/wayfarer/networking/follow.rb +12 -6
  67. data/lib/wayfarer/networking/http.rb +1 -1
  68. data/lib/wayfarer/networking/pool.rb +17 -12
  69. data/lib/wayfarer/networking/selenium.rb +3 -3
  70. data/lib/wayfarer/networking/strategy.rb +2 -2
  71. data/lib/wayfarer/page.rb +36 -14
  72. data/lib/wayfarer/parsing/xml.rb +6 -6
  73. data/lib/wayfarer/parsing.rb +24 -0
  74. data/lib/wayfarer/redis/barrier.rb +13 -21
  75. data/lib/wayfarer/redis/counter.rb +19 -9
  76. data/lib/wayfarer/redis/pool.rb +1 -1
  77. data/lib/wayfarer/redis/resettable.rb +19 -0
  78. data/lib/wayfarer/routing/dsl.rb +1 -0
  79. data/lib/wayfarer/routing/matchers/path.rb +4 -2
  80. data/lib/wayfarer/routing/root_route.rb +5 -1
  81. data/lib/wayfarer/routing/route.rb +4 -14
  82. data/lib/wayfarer/stringify.rb +22 -30
  83. data/lib/wayfarer/task.rb +12 -18
  84. data/lib/wayfarer.rb +29 -2
  85. data/mkdocs.yml +52 -7
  86. data/rake/docs.rake +26 -0
  87. data/rake/lint.rake +105 -0
  88. data/rake/release.rake +29 -0
  89. data/rake/tests.rake +28 -0
  90. data/requirements.txt +1 -1
  91. data/spec/base_spec.rb +140 -160
  92. data/spec/batch_completion_spec.rb +104 -0
  93. data/spec/cli/job_spec.rb +19 -23
  94. data/spec/cli/routing_spec.rb +101 -0
  95. data/spec/cli/version_spec.rb +1 -1
  96. data/spec/factories/task.rb +7 -1
  97. data/spec/fixtures/dummy_job.rb +5 -3
  98. data/spec/gc_spec.rb +8 -50
  99. data/spec/handler_spec.rb +1 -1
  100. data/spec/integration/callbacks_spec.rb +157 -45
  101. data/spec/integration/content_type_spec.rb +145 -0
  102. data/spec/integration/gc_spec.rb +44 -0
  103. data/spec/integration/handler_spec.rb +66 -0
  104. data/spec/integration/page_spec.rb +44 -29
  105. data/spec/integration/params_spec.rb +33 -25
  106. data/spec/integration/parsing_spec.rb +125 -0
  107. data/spec/integration/routing_spec.rb +18 -0
  108. data/spec/integration/stage_spec.rb +27 -20
  109. data/spec/middleware/batch_completion_spec.rb +34 -0
  110. data/spec/middleware/chain_spec.rb +8 -8
  111. data/spec/middleware/content_type_spec.rb +86 -0
  112. data/spec/middleware/controller_spec.rb +5 -5
  113. data/spec/middleware/dedup_spec.rb +38 -55
  114. data/spec/middleware/dispatch_spec.rb +23 -7
  115. data/spec/middleware/normalize_spec.rb +44 -13
  116. data/spec/middleware/router_spec.rb +29 -30
  117. data/spec/middleware/stage_spec.rb +8 -8
  118. data/spec/middleware/uri_parser_spec.rb +53 -0
  119. data/spec/middleware/{fetch_spec.rb → user_agent_spec.rb} +28 -27
  120. data/spec/networking/context_spec.rb +17 -0
  121. data/spec/networking/follow_spec.rb +2 -2
  122. data/spec/networking/pool_spec.rb +5 -5
  123. data/spec/networking/strategy.rb +2 -2
  124. data/spec/page_spec.rb +42 -20
  125. data/spec/parsing/xml_spec.rb +11 -12
  126. data/spec/redis/barrier_spec.rb +8 -48
  127. data/spec/redis/counter_spec.rb +13 -1
  128. data/spec/redis/pool_spec.rb +1 -1
  129. data/spec/spec_helpers.rb +27 -16
  130. data/spec/support/test_app.rb +8 -0
  131. data/spec/task_spec.rb +3 -24
  132. data/spec/wayfarer_spec.rb +1 -1
  133. data/wayfarer.gemspec +4 -3
  134. metadata +61 -51
  135. data/.github/workflows/ci.yaml +0 -32
  136. data/docs/guides/error_handling.md +0 -31
  137. data/docs/guides/networking.md +0 -94
  138. data/docs/guides/performance.md +0 -130
  139. data/docs/guides/reliability.md +0 -41
  140. data/docs/guides/routing/steering.md +0 -30
  141. data/docs/reference/api/base.md +0 -48
  142. data/docs/reference/configuration_keys.md +0 -42
  143. data/docs/reference/environment_variables.md +0 -83
  144. data/lib/wayfarer/cli/base.rb +0 -45
  145. data/lib/wayfarer/cli/generate.rb +0 -17
  146. data/lib/wayfarer/cli/job.rb +0 -56
  147. data/lib/wayfarer/cli/route.rb +0 -29
  148. data/lib/wayfarer/cli/runner.rb +0 -34
  149. data/lib/wayfarer/cli/templates/Gemfile.tt +0 -5
  150. data/lib/wayfarer/cli/templates/job.rb.tt +0 -10
  151. data/lib/wayfarer/config/capybara.rb +0 -10
  152. data/lib/wayfarer/config/ferrum.rb +0 -11
  153. data/lib/wayfarer/config/networking.rb +0 -26
  154. data/lib/wayfarer/config/redis.rb +0 -14
  155. data/lib/wayfarer/config/root.rb +0 -11
  156. data/lib/wayfarer/config/selenium.rb +0 -21
  157. data/lib/wayfarer/config/strconv.rb +0 -45
  158. data/lib/wayfarer/config/struct.rb +0 -72
  159. data/lib/wayfarer/middleware/fetch.rb +0 -56
  160. data/lib/wayfarer/redis/connection.rb +0 -13
  161. data/lib/wayfarer/redis/version.rb +0 -19
  162. data/lib/wayfarer/routing/router.rb +0 -28
  163. data/spec/callbacks_spec.rb +0 -102
  164. data/spec/cli/generate_spec.rb +0 -39
  165. data/spec/config/capybara_spec.rb +0 -18
  166. data/spec/config/ferrum_spec.rb +0 -24
  167. data/spec/config/networking_spec.rb +0 -73
  168. data/spec/config/redis_spec.rb +0 -32
  169. data/spec/config/root_spec.rb +0 -31
  170. data/spec/config/selenium_spec.rb +0 -56
  171. data/spec/config/strconv_spec.rb +0 -58
  172. data/spec/config/struct_spec.rb +0 -66
  173. data/spec/integration/steering_spec.rb +0 -57
  174. data/spec/redis/version_spec.rb +0 -13
  175. data/spec/routing/router_spec.rb +0 -24
@@ -2,69 +2,43 @@
2
2
 
3
3
  module Wayfarer
4
4
  module Callbacks
5
- TERMINATOR = ->(_target, result) { result.call == false }
6
- OPTIONS = { terminator: TERMINATOR, skip_after_callbacks_if_terminated: true }.freeze
5
+ extend ActiveSupport::Concern
7
6
 
8
- ConditionalCallback = Struct.new(:job, :filters) do
9
- def run(method, &block)
10
- return if only && !applies?(only)
11
- return if except && applies?(except)
7
+ included do
8
+ include ActiveSupport::Callbacks
12
9
 
13
- return job.send(method) if method
10
+ define_callbacks :fetch, skip_after_callbacks_if_terminated: true
11
+ define_callbacks :action, skip_after_callbacks_if_terminated: true
12
+ define_callbacks :batch
13
+ end
14
14
 
15
- job.instance_eval(&block)
15
+ class_methods do
16
+ def before_fetch(*filters, &block)
17
+ set_callback(:fetch, :before, *filters, &block)
16
18
  end
17
19
 
18
- private
19
-
20
- def applies?(condition)
21
- case condition
22
- when Symbol then condition == action
23
- when Enumerable then condition&.include?(action)
24
- end
20
+ def around_fetch(*filters, &block)
21
+ set_callback(:fetch, :around, *filters, &block)
25
22
  end
26
23
 
27
- def only
28
- filters[:only]
24
+ def after_fetch(*filters, &block)
25
+ set_callback(:fetch, :after, *filters, &block)
29
26
  end
30
27
 
31
- def except
32
- filters[:except]
28
+ def before_action(*filters, &block)
29
+ set_callback(:action, :before, *filters, &block)
33
30
  end
34
31
 
35
- def action
36
- task.metadata.action
32
+ def around_action(*filters, &block)
33
+ set_callback(:action, :around, *filters, &block)
37
34
  end
38
35
 
39
- def task
40
- job.task
36
+ def after_action(*filters, &block)
37
+ set_callback(:action, :after, *filters, &block)
41
38
  end
42
- end
43
-
44
- def self.included(base)
45
- base.include(ActiveSupport::Callbacks)
46
- base.extend(ClassMethods)
47
-
48
- base.class_eval do
49
- define_callbacks(:fetch, OPTIONS)
50
- define_callbacks(:action, OPTIONS)
51
- define_callbacks(:batch, OPTIONS)
52
-
53
- define(:fetch, :before)
54
- define(:action, :before)
55
- define(:batch, :after)
56
- end
57
- end
58
-
59
- module ClassMethods
60
- private
61
39
 
62
- def define(name, stage)
63
- define_singleton_method([stage, name].join("_")) do |method = nil, **filters, &block|
64
- set_callback(name, stage, **filters) do |job|
65
- ConditionalCallback.new(job, filters).run(method, &block)
66
- end
67
- end
40
+ def after_batch(*filters, &block)
41
+ set_callback(:batch, :after, *filters, &block)
68
42
  end
69
43
  end
70
44
  end
@@ -1,16 +1,14 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Wayfarer
4
- module CLI
4
+ class CLI
5
5
  class RoutePrinter < Thor::Shell::Color
6
- attr_reader :url,
7
- :path_finder,
8
- :output
6
+ attr_reader :url, :path_finder, :output
9
7
 
10
- INDENT = " "
11
- REGULAR_SEGMENT = "│ "
8
+ INDENT = " "
9
+ REGULAR_SEGMENT = "│ "
12
10
  JUNCTION_SEGMENT = "├──"
13
- CORNER_SEGMENT = "└──"
11
+ CORNER_SEGMENT = "└──"
14
12
 
15
13
  def self.print(route, url)
16
14
  route.accept(new(url))
@@ -24,81 +22,83 @@ module Wayfarer
24
22
 
25
23
  def visit(route)
26
24
  route.accept(path_finder) unless route.parent
27
- return true if route.is_a?(Wayfarer::Routing::RootRoute)
28
-
29
- puts [segments(route), label(route)].join("")[3..]
25
+ puts format_route_output(route)
30
26
  true
31
27
  end
32
28
 
29
+ private
30
+
31
+ def format_route_output(route)
32
+ [segments(route), route_description(route)].join[3..]
33
+ end
34
+
33
35
  def segments(route)
34
- current = segment(route)
35
- parents = parents(route).map { |parent| parent_segment(parent) }
36
- [parents, current].join
36
+ [parents(route).map { |parent| parent_segment(parent) }, segment(route)].join
37
37
  end
38
38
 
39
39
  def parent_segment(parent)
40
- if trailer?(parent)
41
- INDENT
42
- else
43
- REGULAR_SEGMENT
44
- end
40
+ trailer?(parent) ? INDENT : REGULAR_SEGMENT
45
41
  end
46
42
 
47
43
  def segment(route)
48
- if trailer?(route)
49
- CORNER_SEGMENT
50
- else
51
- JUNCTION_SEGMENT
52
- end
44
+ trailer?(route) ? CORNER_SEGMENT : JUNCTION_SEGMENT
53
45
  end
54
46
 
55
- def label(route)
56
- [highlight_matcher(route, matcher_label(route)),
57
- highlight_options(route, options(route)),
58
- highlight_options(route, params(route))].compact.join(" ")
47
+ def route_description(route)
48
+ attrs = [route_arg(route), routing_result(route), route_action(route), route_params(route)].compact
49
+ text = attrs.any? ? "#{matcher_name(route)}(#{attrs.join(', ')})" : matcher_name(route)
50
+ set_color(text, *route_colors(route))
59
51
  end
60
52
 
61
- def highlight_matcher(route, string)
62
- if path_finder.path.include?(route)
63
- set_color(string, :green, :bold)
64
- elsif route.matcher.match(url)
65
- set_color(string, :green)
53
+ def matcher_name(route)
54
+ case route
55
+ when Wayfarer::Routing::TargetRoute
56
+ "Target"
57
+ when Wayfarer::Routing::RootRoute
58
+ Wayfarer::Routing::PathFinder.result(route, url).class.name.demodulize
66
59
  else
67
- set_color(string, :red)
60
+ route.matcher.class.name.demodulize
68
61
  end
69
62
  end
70
63
 
71
- def highlight_options(route, string)
72
- return string unless path_finder.path.include?(route)
64
+ def routing_result(route)
65
+ return if route.is_a?(Wayfarer::Routing::RootRoute)
73
66
 
74
- set_color(string, :green, :bold)
67
+ "match: #{route.matcher.match(url)}"
75
68
  end
76
69
 
77
- def matcher_label(route)
78
- return "Target" if route.is_a?(Wayfarer::Routing::TargetRoute)
70
+ def route_action(route)
71
+ return unless route.is_a?(Wayfarer::Routing::RootRoute)
79
72
 
80
- route.matcher.class.name.demodulize
73
+ result = Wayfarer::Routing::PathFinder.result(route, url)
74
+ result.action.inspect if result.is_a?(Wayfarer::Routing::Result::Match)
81
75
  end
82
76
 
83
- def options(route)
84
- return "" if route.is_a?(Wayfarer::Routing::RootRoute)
85
-
86
- case (matcher = route.matcher)
87
- when Wayfarer::Routing::Matchers::Host then matcher.host
88
- when Wayfarer::Routing::Matchers::Path then matcher.path
89
- when Wayfarer::Routing::Matchers::Query then matcher.fields
90
- when Wayfarer::Routing::Matchers::Custom then "##{route.action}"
91
- when Wayfarer::Routing::Matchers::Scheme then matcher.scheme
92
- when Wayfarer::Routing::Matchers::Suffix then matcher.suffix
93
- end
77
+ def route_arg(route)
78
+ return if route.is_a?(Wayfarer::Routing::RootRoute) || route.is_a?(Wayfarer::Routing::TargetRoute)
79
+
80
+ matcher = route.matcher
81
+ matcher_opts = case matcher
82
+ when Wayfarer::Routing::Matchers::Host then matcher.host
83
+ when Wayfarer::Routing::Matchers::Path then matcher.path
84
+ when Wayfarer::Routing::Matchers::Query then matcher.fields
85
+ when Wayfarer::Routing::Matchers::Custom then route.action.to_s
86
+ when Wayfarer::Routing::Matchers::Scheme then matcher.scheme
87
+ when Wayfarer::Routing::Matchers::Suffix then matcher.suffix
88
+ end
89
+ matcher_opts.inspect
94
90
  end
95
91
 
96
- def params(route)
97
- params = route.matcher.params(url)
98
- "=> #{params.symbolize_keys}" if params.any?
99
- end
92
+ def route_params(route)
93
+ params = if route.is_a?(Wayfarer::Routing::RootRoute)
94
+ result = Wayfarer::Routing::PathFinder.result(route, url)
95
+ result.params if result.is_a?(Wayfarer::Routing::Result::Match)
96
+ else
97
+ route.matcher.params(url)
98
+ end
100
99
 
101
- private
100
+ "params: #{params.symbolize_keys}" if params&.any?
101
+ end
102
102
 
103
103
  def parents(route, current = [])
104
104
  return current unless route.parent
@@ -107,9 +107,23 @@ module Wayfarer
107
107
  end
108
108
 
109
109
  def trailer?(route)
110
- return true unless route.parent
110
+ !route.parent || route.parent.children.last == route
111
+ end
112
+
113
+ def route_colors(route)
114
+ if path_finder.path.include?(route)
115
+ %i[green bold]
116
+ elsif route.matcher.match(url)
117
+ %i[green]
118
+ else
119
+ %i[red]
120
+ end
121
+ end
122
+
123
+ def set_color(string, *colors)
124
+ return string if ENV.key?("NO_COLOR")
111
125
 
112
- route.parent.children.last == route
126
+ super(string, *colors)
113
127
  end
114
128
  end
115
129
  end
@@ -0,0 +1,121 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Wayfarer
4
+ class CLI < Thor
5
+ def self.exit_on_failure?
6
+ true
7
+ end
8
+
9
+ desc "version", "Print version"
10
+ def version
11
+ say Wayfarer::VERSION::STRING
12
+ end
13
+
14
+ class_option :require, aliases: :r, type: :string, default: nil
15
+
16
+ desc "route JOB URL", "Routing result of URL for JOB"
17
+ def route(job, url)
18
+ load_environment
19
+
20
+ url = parsed_url(url)
21
+ job = job.classify.constantize
22
+ route = job.route
23
+ route.invoke(url)
24
+
25
+ result = Wayfarer::Routing::PathFinder.result(route, url)
26
+ result_type = result.class.name.demodulize
27
+
28
+ say case result
29
+ when Wayfarer::Routing::Result::Match
30
+ "#{result_type} => #{result.action.inspect}"
31
+ else
32
+ result_type
33
+ end
34
+ end
35
+
36
+ desc "tree JOB URL", "Visualize JOB's routing tree for URL"
37
+ def tree(job, url)
38
+ load_environment
39
+
40
+ url = parsed_url(url)
41
+ job = job.classify.constantize
42
+ route = job.route
43
+ route.invoke(url)
44
+
45
+ Wayfarer::CLI::RoutePrinter.print(route, url)
46
+ end
47
+
48
+ desc "perform JOB URL", "Perform JOB with URL"
49
+ option :mock_redis, type: :boolean
50
+ option :batch, type: :string, default: SecureRandom.uuid
51
+ def perform(job, url)
52
+ load_environment
53
+ mock_redis
54
+
55
+ job = job.classify.constantize
56
+ task = Wayfarer::Task.new(url, options[:batch])
57
+ job.new(task).perform_now
58
+ end
59
+
60
+ desc "enqueue JOB URL", "Enqueue JOB with URL"
61
+ option :batch, type: :string, default: SecureRandom.uuid
62
+ def enqueue(job, url)
63
+ load_environment
64
+
65
+ job = job.classify.constantize
66
+ job.crawl(url, batch: options[:batch])
67
+ end
68
+
69
+ desc "execute JOB URL", "Execute JOB with async adapter starting from URL"
70
+ option :mock_redis, type: :boolean
71
+ option :batch, type: :string, default: SecureRandom.uuid
72
+ option :min_threads, type: :numeric, default: 1
73
+ option :max_threads, type: :numeric, default: 1
74
+ def execute(job, url)
75
+ load_environment
76
+ mock_redis
77
+
78
+ job = job.classify.constantize
79
+ job.queue_adapter = ActiveJob::QueueAdapters::AsyncAdapter.new(min_threads: options[:min_threads],
80
+ max_threads: options[:max_threads])
81
+ scheduler = job.queue_adapter.instance_variable_get(:@scheduler)
82
+ executor = scheduler.instance_variable_get(:@async_executor)
83
+
84
+ job.crawl(url, batch: options[:batch])
85
+
86
+ sleep(0.1) while executor.scheduled_task_count > executor.completed_task_count
87
+
88
+ Wayfarer::Networking::Pool.instance.free
89
+ end
90
+
91
+ private
92
+
93
+ def mock_redis
94
+ Wayfarer.config[:redis][:factory] = ->(_) { MockRedis.new } if options[:mock_redis]
95
+ end
96
+
97
+ def parsed_url(url)
98
+ Addressable::URI.parse(url).normalize
99
+ end
100
+
101
+ def load_environment(require_path = options[:require])
102
+ require File.join(Dir.pwd, require_path) if require_path
103
+
104
+ load_rails
105
+ end
106
+
107
+ def load_rails
108
+ begin
109
+ require "rails/app_loader"
110
+ rescue LoadError
111
+ return
112
+ end
113
+
114
+ return unless Rails::AppLoader.find_executable
115
+
116
+ require File.expand_path("config/application", Dir.pwd)
117
+ require File.expand_path("config/boot", Dir.pwd)
118
+ require File.expand_path("config/environment", Dir.pwd)
119
+ end
120
+ end
121
+ end
data/lib/wayfarer/gc.rb CHANGED
@@ -1,14 +1,21 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Wayfarer
4
- GC = Struct.new(:task) do
5
- def run
6
- return unless task.counter.decrement <= 0
4
+ # TODO: Add logging
5
+ module GC
6
+ RESETTABLES = [Wayfarer::Redis::Barrier, Wayfarer::Redis::Counter].freeze
7
7
 
8
- task.metadata.job.run_callbacks(:batch)
8
+ class << self
9
+ include Wayfarer::Logging.emit(gc: [:info, "Garbage collecting %<resettable>s"])
10
+ end
11
+
12
+ module_function
9
13
 
10
- task.barrier.reset!
11
- task.counter.reset!
14
+ def run(task)
15
+ RESETTABLES.each do |resettable|
16
+ log(:gc, task, resettable: resettable)
17
+ resettable.new(task).reset!
18
+ end
12
19
  end
13
20
  end
14
21
  end
@@ -1,15 +1,23 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Wayfarer
4
- class Handler
5
- include Wayfarer::Middleware::Controller
4
+ module Handler
5
+ extend ActiveSupport::Concern
6
6
 
7
- api Wayfarer::Middleware::Fetch
8
- api Wayfarer::Middleware::Stage
7
+ included do
8
+ include Wayfarer::Middleware::Controller
9
9
 
10
- use Wayfarer::Middleware::Router
11
- use Wayfarer::Middleware::Dispatch
10
+ use Wayfarer::Middleware::ContentType
11
+ use Wayfarer::Middleware::Router
12
+ use Wayfarer::Middleware::Dispatch
12
13
 
13
- singleton_class.undef_method :after_batch
14
+ api Wayfarer::Middleware::UserAgent
15
+ api Wayfarer::Middleware::Stage
16
+
17
+ singleton_class.undef_method :before_fetch
18
+ singleton_class.undef_method :around_fetch
19
+ singleton_class.undef_method :after_fetch
20
+ singleton_class.undef_method :after_batch
21
+ end
14
22
  end
15
23
  end
@@ -0,0 +1,38 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Wayfarer
4
+ module Logging
5
+ mattr_accessor :logger, default: ActiveSupport::Logger.new($stdout)
6
+
7
+ def self.emit(...)
8
+ Emitter.new(...)
9
+ end
10
+
11
+ class Emitter < Module
12
+ def initialize(messages)
13
+ @messages = messages
14
+
15
+ super()
16
+ end
17
+
18
+ def included(base)
19
+ messages = @messages
20
+
21
+ base.class_eval do
22
+ define_method(:log) do |key, task, **args|
23
+ level, msg = messages[key] || raise(ArgumentError, "No log message for #{key.inspect}")
24
+ severity = ActiveSupport::Logger::Severity.const_get(level.upcase)
25
+
26
+ ActiveSupport::TaggedLogging
27
+ .new(Logging.logger)
28
+ .tagged(task.batch, task.url, task[:controller]&.class&.name) do |logger|
29
+ logger.add(severity, msg % args)
30
+ end
31
+ end
32
+ end
33
+ end
34
+ end
35
+
36
+ private_constant :Emitter
37
+ end
38
+ end
@@ -3,6 +3,8 @@
3
3
  module Wayfarer
4
4
  module Middleware
5
5
  module Base
6
+ extend ActiveSupport::Concern
7
+
6
8
  API_MODULE = :API
7
9
 
8
10
  def api
@@ -0,0 +1,19 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Wayfarer
4
+ module Middleware
5
+ class BatchCompletion
6
+ extend Base
7
+
8
+ def call(task)
9
+ # Comparing to the initial state of `exception_executions` allows
10
+ # us to determine if an exception occurred when the job was performed,
11
+ # since the `perform.active_job` event is emitted for both successful
12
+ # and raising jobs.
13
+ task[:initial_exception_executions] ||= task[:job].exception_executions.clone
14
+
15
+ yield if block_given?
16
+ end
17
+ end
18
+ end
19
+ end
@@ -0,0 +1,54 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Wayfarer
4
+ module Middleware
5
+ class ContentType
6
+ extend Base
7
+
8
+ module API
9
+ extend ActiveSupport::Concern
10
+
11
+ included do
12
+ class_attribute :allowed_content_types,
13
+ default: { index: {}, patterns: Set.new },
14
+ instance_accessor: false,
15
+ instance_predicate: false
16
+ end
17
+
18
+ class_methods do
19
+ def content_type(*content_types)
20
+ content_types.each do |content_type|
21
+ case content_type
22
+ when String then allowed_content_types[:index][content_type] = true
23
+ when Regexp then allowed_content_types[:patterns] << content_type
24
+ end
25
+ end
26
+ end
27
+ end
28
+ end
29
+
30
+ def call(task)
31
+ yield if block_given? && permitted?(task)
32
+ end
33
+
34
+ private
35
+
36
+ def permitted?(task)
37
+ job = task[:controller]
38
+ content_types = job.class.allowed_content_types
39
+
40
+ return true if allows_all?(content_types)
41
+
42
+ content_type = task[:page].mime_type&.to_s || task[:page].headers["content-type"] || (return false)
43
+
44
+ content_types[:index].key?(content_type) || content_types[:patterns].any? do |pattern|
45
+ pattern.match?(content_type)
46
+ end
47
+ end
48
+
49
+ def allows_all?(content_types)
50
+ content_types[:index].empty? && content_types[:patterns].empty?
51
+ end
52
+ end
53
+ end
54
+ end
@@ -3,16 +3,20 @@
3
3
  module Wayfarer
4
4
  module Middleware
5
5
  module Controller
6
- def self.included(base)
7
- base.cattr_accessor :chain, default: Chain.empty
8
- base.attr_accessor :task
6
+ extend ActiveSupport::Concern
9
7
 
10
- base.extend(ClassMethods)
11
- base.include(InstanceMethods)
12
- base.include(Wayfarer::Callbacks)
8
+ included do
9
+ class_attribute :chain,
10
+ default: Chain.empty,
11
+ instance_accessor: false,
12
+ instance_predicate: false
13
+
14
+ attr_accessor :task
15
+
16
+ include Wayfarer::Callbacks
13
17
  end
14
18
 
15
- module ClassMethods
19
+ class_methods do
16
20
  def use(middleware)
17
21
  chain.push(middleware.lazy)
18
22
  api(middleware)
@@ -23,17 +27,17 @@ module Wayfarer
23
27
  end
24
28
  end
25
29
 
26
- module InstanceMethods
27
- def call(task)
28
- self.task = task
30
+ def call(task)
31
+ self.task = task
29
32
 
30
- task.metadata.job ||= self
31
- task.metadata.controller = self
33
+ task[:job] ||= self
34
+ task[:controller] = self
32
35
 
33
- self.class.chain.call(task) do
34
- yield if block_given?
35
- end
36
+ self.class.chain.call(task) do
37
+ yield if block_given?
36
38
  end
39
+
40
+ task[:return_value]
37
41
  end
38
42
  end
39
43
  end
@@ -5,25 +5,28 @@ module Wayfarer
5
5
  class Dedup
6
6
  extend Base
7
7
 
8
+ include Wayfarer::Logging.emit(
9
+ deduplicated: [:info, "Deduplicated URL"],
10
+ retry: [:debug, "Not deduplicating retry"],
11
+ rerouted: [:debug, "Not deduplicating rerouted task"]
12
+ )
13
+
8
14
  def call(task)
9
- # Was task routed by a previous controller already?
10
- return yield if task.metadata.action
15
+ task[:barrier] ||= Wayfarer::Redis::Barrier.new(task)
11
16
 
12
- return if task.barrier.seen?(task.url)
17
+ if task[:job].executions > 1
18
+ log(:retry, task)
19
+ return yield if block_given?
20
+ end
13
21
 
14
- begin
15
- yield if block_given?
16
- rescue StandardError => e
17
- task.barrier.unsee(task.url)
18
- raise e
22
+ if task[:job] != task[:controller]
23
+ log(:rerouted, task)
24
+ return yield if block_given?
19
25
  end
20
26
 
21
- staged_urls = task.metadata.staged_urls
22
- return if staged_urls.none?
27
+ return log(:deduplicated, task) if task[:barrier].check!(task[:normalized_url])
23
28
 
24
- inclusion = task.barrier.peek(staged_urls.to_a)
25
- unseen = staged_urls.zip(inclusion).reject { |_, seen| seen }.map(&:first)
26
- task.metadata.staged_urls = SortedSet.new(unseen)
29
+ yield if block_given?
27
30
  end
28
31
  end
29
32
  end