wayfarer 0.4.6 → 0.4.7

Sign up to get free protection for your applications and to get access to all the features.
Files changed (175)
  1. checksums.yaml +4 -4
  2. data/.github/workflows/lint.yaml +25 -0
  3. data/.github/workflows/release.yaml +29 -0
  4. data/.github/workflows/tests.yaml +30 -0
  5. data/.gitignore +4 -0
  6. data/.rubocop.yml +5 -0
  7. data/.vale.ini +5 -0
  8. data/.yardopts +1 -3
  9. data/Dockerfile +5 -4
  10. data/Gemfile +3 -0
  11. data/Gemfile.lock +107 -102
  12. data/Rakefile +5 -56
  13. data/bin/wayfarer +1 -1
  14. data/docker-compose.yml +20 -9
  15. data/docs/cookbook/consent_screen.md +2 -2
  16. data/docs/cookbook/executing_javascript.md +3 -3
  17. data/docs/cookbook/navigation.md +12 -12
  18. data/docs/cookbook/querying_html.md +3 -3
  19. data/docs/cookbook/screenshots.md +2 -2
  20. data/docs/cookbook/user_agent.md +1 -1
  21. data/docs/design.md +36 -0
  22. data/docs/guides/callbacks.md +24 -126
  23. data/docs/guides/configuration.md +8 -8
  24. data/docs/guides/handlers.md +60 -0
  25. data/docs/guides/index.md +1 -0
  26. data/docs/guides/jobs/error_handling.md +40 -0
  27. data/docs/guides/jobs.md +99 -31
  28. data/docs/guides/navigation.md +1 -1
  29. data/docs/guides/networking/capybara.md +13 -22
  30. data/docs/guides/networking/custom_adapters.md +82 -41
  31. data/docs/guides/networking/ferrum.md +4 -4
  32. data/docs/guides/networking/http.md +9 -13
  33. data/docs/guides/networking/selenium.md +10 -11
  34. data/docs/guides/pages.md +76 -10
  35. data/docs/guides/redis.md +10 -0
  36. data/docs/guides/routing.md +74 -0
  37. data/docs/guides/tasks.md +33 -9
  38. data/docs/guides/tutorial.md +60 -0
  39. data/docs/guides/user_agents.md +113 -0
  40. data/docs/index.md +17 -40
  41. data/docs/reference/cli.md +35 -25
  42. data/docs/reference/configuration.md +36 -0
  43. data/lib/wayfarer/base.rb +124 -46
  44. data/lib/wayfarer/batch_completion.rb +56 -0
  45. data/lib/wayfarer/callbacks.rb +22 -48
  46. data/lib/wayfarer/cli/route_printer.rb +71 -57
  47. data/lib/wayfarer/cli.rb +121 -0
  48. data/lib/wayfarer/gc.rb +13 -6
  49. data/lib/wayfarer/handler.rb +15 -7
  50. data/lib/wayfarer/logging.rb +38 -0
  51. data/lib/wayfarer/middleware/base.rb +2 -0
  52. data/lib/wayfarer/middleware/batch_completion.rb +19 -0
  53. data/lib/wayfarer/middleware/content_type.rb +54 -0
  54. data/lib/wayfarer/middleware/controller.rb +19 -15
  55. data/lib/wayfarer/middleware/dedup.rb +16 -13
  56. data/lib/wayfarer/middleware/dispatch.rb +12 -4
  57. data/lib/wayfarer/middleware/normalize.rb +12 -11
  58. data/lib/wayfarer/middleware/redis.rb +15 -0
  59. data/lib/wayfarer/middleware/router.rb +33 -35
  60. data/lib/wayfarer/middleware/stage.rb +5 -5
  61. data/lib/wayfarer/middleware/uri_parser.rb +30 -0
  62. data/lib/wayfarer/middleware/user_agent.rb +49 -0
  63. data/lib/wayfarer/networking/capybara.rb +1 -1
  64. data/lib/wayfarer/networking/context.rb +2 -2
  65. data/lib/wayfarer/networking/ferrum.rb +2 -2
  66. data/lib/wayfarer/networking/follow.rb +12 -6
  67. data/lib/wayfarer/networking/http.rb +1 -1
  68. data/lib/wayfarer/networking/pool.rb +17 -12
  69. data/lib/wayfarer/networking/selenium.rb +3 -3
  70. data/lib/wayfarer/networking/strategy.rb +2 -2
  71. data/lib/wayfarer/page.rb +36 -14
  72. data/lib/wayfarer/parsing/xml.rb +6 -6
  73. data/lib/wayfarer/parsing.rb +24 -0
  74. data/lib/wayfarer/redis/barrier.rb +13 -21
  75. data/lib/wayfarer/redis/counter.rb +19 -9
  76. data/lib/wayfarer/redis/pool.rb +1 -1
  77. data/lib/wayfarer/redis/resettable.rb +19 -0
  78. data/lib/wayfarer/routing/dsl.rb +1 -0
  79. data/lib/wayfarer/routing/matchers/path.rb +4 -2
  80. data/lib/wayfarer/routing/root_route.rb +5 -1
  81. data/lib/wayfarer/routing/route.rb +4 -14
  82. data/lib/wayfarer/stringify.rb +22 -30
  83. data/lib/wayfarer/task.rb +12 -18
  84. data/lib/wayfarer.rb +28 -1
  85. data/mkdocs.yml +52 -7
  86. data/rake/docs.rake +26 -0
  87. data/rake/lint.rake +105 -0
  88. data/rake/release.rake +29 -0
  89. data/rake/tests.rake +28 -0
  90. data/requirements.txt +1 -1
  91. data/spec/base_spec.rb +140 -160
  92. data/spec/batch_completion_spec.rb +104 -0
  93. data/spec/cli/job_spec.rb +19 -23
  94. data/spec/cli/routing_spec.rb +101 -0
  95. data/spec/cli/version_spec.rb +1 -1
  96. data/spec/factories/task.rb +7 -1
  97. data/spec/fixtures/dummy_job.rb +5 -3
  98. data/spec/gc_spec.rb +8 -50
  99. data/spec/handler_spec.rb +1 -1
  100. data/spec/integration/callbacks_spec.rb +157 -45
  101. data/spec/integration/content_type_spec.rb +145 -0
  102. data/spec/integration/gc_spec.rb +44 -0
  103. data/spec/integration/handler_spec.rb +66 -0
  104. data/spec/integration/page_spec.rb +44 -29
  105. data/spec/integration/params_spec.rb +33 -25
  106. data/spec/integration/parsing_spec.rb +125 -0
  107. data/spec/integration/routing_spec.rb +18 -0
  108. data/spec/integration/stage_spec.rb +27 -20
  109. data/spec/middleware/batch_completion_spec.rb +34 -0
  110. data/spec/middleware/chain_spec.rb +8 -8
  111. data/spec/middleware/content_type_spec.rb +86 -0
  112. data/spec/middleware/controller_spec.rb +5 -5
  113. data/spec/middleware/dedup_spec.rb +38 -55
  114. data/spec/middleware/dispatch_spec.rb +23 -7
  115. data/spec/middleware/normalize_spec.rb +44 -13
  116. data/spec/middleware/router_spec.rb +29 -30
  117. data/spec/middleware/stage_spec.rb +8 -8
  118. data/spec/middleware/uri_parser_spec.rb +53 -0
  119. data/spec/middleware/{fetch_spec.rb → user_agent_spec.rb} +28 -27
  120. data/spec/networking/context_spec.rb +1 -1
  121. data/spec/networking/follow_spec.rb +2 -2
  122. data/spec/networking/pool_spec.rb +5 -5
  123. data/spec/networking/strategy.rb +2 -2
  124. data/spec/page_spec.rb +42 -20
  125. data/spec/parsing/xml_spec.rb +11 -12
  126. data/spec/redis/barrier_spec.rb +8 -48
  127. data/spec/redis/counter_spec.rb +13 -1
  128. data/spec/redis/pool_spec.rb +1 -1
  129. data/spec/spec_helpers.rb +27 -16
  130. data/spec/support/test_app.rb +8 -0
  131. data/spec/task_spec.rb +3 -24
  132. data/spec/wayfarer_spec.rb +1 -1
  133. data/wayfarer.gemspec +4 -3
  134. metadata +61 -51
  135. data/.github/workflows/ci.yaml +0 -32
  136. data/docs/guides/error_handling.md +0 -53
  137. data/docs/guides/networking.md +0 -94
  138. data/docs/guides/performance.md +0 -130
  139. data/docs/guides/reliability.md +0 -41
  140. data/docs/guides/routing/steering.md +0 -30
  141. data/docs/reference/api/base.md +0 -48
  142. data/docs/reference/configuration_keys.md +0 -43
  143. data/docs/reference/environment_variables.md +0 -83
  144. data/lib/wayfarer/cli/base.rb +0 -45
  145. data/lib/wayfarer/cli/generate.rb +0 -17
  146. data/lib/wayfarer/cli/job.rb +0 -56
  147. data/lib/wayfarer/cli/route.rb +0 -29
  148. data/lib/wayfarer/cli/runner.rb +0 -34
  149. data/lib/wayfarer/cli/templates/Gemfile.tt +0 -5
  150. data/lib/wayfarer/cli/templates/job.rb.tt +0 -10
  151. data/lib/wayfarer/config/capybara.rb +0 -10
  152. data/lib/wayfarer/config/ferrum.rb +0 -11
  153. data/lib/wayfarer/config/networking.rb +0 -29
  154. data/lib/wayfarer/config/redis.rb +0 -14
  155. data/lib/wayfarer/config/root.rb +0 -11
  156. data/lib/wayfarer/config/selenium.rb +0 -21
  157. data/lib/wayfarer/config/strconv.rb +0 -45
  158. data/lib/wayfarer/config/struct.rb +0 -72
  159. data/lib/wayfarer/middleware/fetch.rb +0 -56
  160. data/lib/wayfarer/redis/connection.rb +0 -13
  161. data/lib/wayfarer/redis/version.rb +0 -19
  162. data/lib/wayfarer/routing/router.rb +0 -28
  163. data/spec/callbacks_spec.rb +0 -102
  164. data/spec/cli/generate_spec.rb +0 -39
  165. data/spec/config/capybara_spec.rb +0 -18
  166. data/spec/config/ferrum_spec.rb +0 -24
  167. data/spec/config/networking_spec.rb +0 -73
  168. data/spec/config/redis_spec.rb +0 -32
  169. data/spec/config/root_spec.rb +0 -31
  170. data/spec/config/selenium_spec.rb +0 -56
  171. data/spec/config/strconv_spec.rb +0 -58
  172. data/spec/config/struct_spec.rb +0 -66
  173. data/spec/integration/steering_spec.rb +0 -57
  174. data/spec/redis/version_spec.rb +0 -13
  175. data/spec/routing/router_spec.rb +0 -24
data/lib/wayfarer/callbacks.rb CHANGED
@@ -2,69 +2,43 @@
2
2
 
3
3
  module Wayfarer
4
4
  module Callbacks
5
- TERMINATOR = ->(_target, result) { result.call == false }
6
- OPTIONS = { terminator: TERMINATOR, skip_after_callbacks_if_terminated: true }.freeze
5
+ extend ActiveSupport::Concern
7
6
 
8
- ConditionalCallback = Struct.new(:job, :filters) do
9
- def run(method, &block)
10
- return if only && !applies?(only)
11
- return if except && applies?(except)
7
+ included do
8
+ include ActiveSupport::Callbacks
12
9
 
13
- return job.send(method) if method
10
+ define_callbacks :fetch, skip_after_callbacks_if_terminated: true
11
+ define_callbacks :action, skip_after_callbacks_if_terminated: true
12
+ define_callbacks :batch
13
+ end
14
14
 
15
- job.instance_eval(&block)
15
+ class_methods do
16
+ def before_fetch(*filters, &block)
17
+ set_callback(:fetch, :before, *filters, &block)
16
18
  end
17
19
 
18
- private
19
-
20
- def applies?(condition)
21
- case condition
22
- when Symbol then condition == action
23
- when Enumerable then condition&.include?(action)
24
- end
20
+ def around_fetch(*filters, &block)
21
+ set_callback(:fetch, :around, *filters, &block)
25
22
  end
26
23
 
27
- def only
28
- filters[:only]
24
+ def after_fetch(*filters, &block)
25
+ set_callback(:fetch, :after, *filters, &block)
29
26
  end
30
27
 
31
- def except
32
- filters[:except]
28
+ def before_action(*filters, &block)
29
+ set_callback(:action, :before, *filters, &block)
33
30
  end
34
31
 
35
- def action
36
- task.metadata.action
32
+ def around_action(*filters, &block)
33
+ set_callback(:action, :around, *filters, &block)
37
34
  end
38
35
 
39
- def task
40
- job.task
36
+ def after_action(*filters, &block)
37
+ set_callback(:action, :after, *filters, &block)
41
38
  end
42
- end
43
-
44
- def self.included(base)
45
- base.include(ActiveSupport::Callbacks)
46
- base.extend(ClassMethods)
47
-
48
- base.class_eval do
49
- define_callbacks(:fetch, OPTIONS)
50
- define_callbacks(:action, OPTIONS)
51
- define_callbacks(:batch, OPTIONS)
52
-
53
- define(:fetch, :before)
54
- define(:action, :before)
55
- define(:batch, :after)
56
- end
57
- end
58
-
59
- module ClassMethods
60
- private
61
39
 
62
- def define(name, stage)
63
- define_singleton_method([stage, name].join("_")) do |method = nil, **filters, &block|
64
- set_callback(name, stage, **filters) do |job|
65
- ConditionalCallback.new(job, filters).run(method, &block)
66
- end
67
- end
40
+ def after_batch(*filters, &block)
41
+ set_callback(:batch, :after, *filters, &block)
68
42
  end
69
43
  end
70
44
  end
data/lib/wayfarer/cli/route_printer.rb CHANGED
@@ -1,16 +1,14 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Wayfarer
4
- module CLI
4
+ class CLI
5
5
  class RoutePrinter < Thor::Shell::Color
6
- attr_reader :url,
7
- :path_finder,
8
- :output
6
+ attr_reader :url, :path_finder, :output
9
7
 
10
- INDENT = " "
11
- REGULAR_SEGMENT = "│ "
8
+ INDENT = " "
9
+ REGULAR_SEGMENT = "│ "
12
10
  JUNCTION_SEGMENT = "├──"
13
- CORNER_SEGMENT = "└──"
11
+ CORNER_SEGMENT = "└──"
14
12
 
15
13
  def self.print(route, url)
16
14
  route.accept(new(url))
@@ -24,81 +22,83 @@ module Wayfarer
24
22
 
25
23
  def visit(route)
26
24
  route.accept(path_finder) unless route.parent
27
- return true if route.is_a?(Wayfarer::Routing::RootRoute)
28
-
29
- puts [segments(route), label(route)].join("")[3..]
25
+ puts format_route_output(route)
30
26
  true
31
27
  end
32
28
 
29
+ private
30
+
31
+ def format_route_output(route)
32
+ [segments(route), route_description(route)].join[3..]
33
+ end
34
+
33
35
  def segments(route)
34
- current = segment(route)
35
- parents = parents(route).map { |parent| parent_segment(parent) }
36
- [parents, current].join
36
+ [parents(route).map { |parent| parent_segment(parent) }, segment(route)].join
37
37
  end
38
38
 
39
39
  def parent_segment(parent)
40
- if trailer?(parent)
41
- INDENT
42
- else
43
- REGULAR_SEGMENT
44
- end
40
+ trailer?(parent) ? INDENT : REGULAR_SEGMENT
45
41
  end
46
42
 
47
43
  def segment(route)
48
- if trailer?(route)
49
- CORNER_SEGMENT
50
- else
51
- JUNCTION_SEGMENT
52
- end
44
+ trailer?(route) ? CORNER_SEGMENT : JUNCTION_SEGMENT
53
45
  end
54
46
 
55
- def label(route)
56
- [highlight_matcher(route, matcher_label(route)),
57
- highlight_options(route, options(route)),
58
- highlight_options(route, params(route))].compact.join(" ")
47
+ def route_description(route)
48
+ attrs = [route_arg(route), routing_result(route), route_action(route), route_params(route)].compact
49
+ text = attrs.any? ? "#{matcher_name(route)}(#{attrs.join(', ')})" : matcher_name(route)
50
+ set_color(text, *route_colors(route))
59
51
  end
60
52
 
61
- def highlight_matcher(route, string)
62
- if path_finder.path.include?(route)
63
- set_color(string, :green, :bold)
64
- elsif route.matcher.match(url)
65
- set_color(string, :green)
53
+ def matcher_name(route)
54
+ case route
55
+ when Wayfarer::Routing::TargetRoute
56
+ "Target"
57
+ when Wayfarer::Routing::RootRoute
58
+ Wayfarer::Routing::PathFinder.result(route, url).class.name.demodulize
66
59
  else
67
- set_color(string, :red)
60
+ route.matcher.class.name.demodulize
68
61
  end
69
62
  end
70
63
 
71
- def highlight_options(route, string)
72
- return string unless path_finder.path.include?(route)
64
+ def routing_result(route)
65
+ return if route.is_a?(Wayfarer::Routing::RootRoute)
73
66
 
74
- set_color(string, :green, :bold)
67
+ "match: #{route.matcher.match(url)}"
75
68
  end
76
69
 
77
- def matcher_label(route)
78
- return "Target" if route.is_a?(Wayfarer::Routing::TargetRoute)
70
+ def route_action(route)
71
+ return unless route.is_a?(Wayfarer::Routing::RootRoute)
79
72
 
80
- route.matcher.class.name.demodulize
73
+ result = Wayfarer::Routing::PathFinder.result(route, url)
74
+ result.action.inspect if result.is_a?(Wayfarer::Routing::Result::Match)
81
75
  end
82
76
 
83
- def options(route)
84
- return "" if route.is_a?(Wayfarer::Routing::RootRoute)
85
-
86
- case (matcher = route.matcher)
87
- when Wayfarer::Routing::Matchers::Host then matcher.host
88
- when Wayfarer::Routing::Matchers::Path then matcher.path
89
- when Wayfarer::Routing::Matchers::Query then matcher.fields
90
- when Wayfarer::Routing::Matchers::Custom then "##{route.action}"
91
- when Wayfarer::Routing::Matchers::Scheme then matcher.scheme
92
- when Wayfarer::Routing::Matchers::Suffix then matcher.suffix
93
- end
77
+ def route_arg(route)
78
+ return if route.is_a?(Wayfarer::Routing::RootRoute) || route.is_a?(Wayfarer::Routing::TargetRoute)
79
+
80
+ matcher = route.matcher
81
+ matcher_opts = case matcher
82
+ when Wayfarer::Routing::Matchers::Host then matcher.host
83
+ when Wayfarer::Routing::Matchers::Path then matcher.path
84
+ when Wayfarer::Routing::Matchers::Query then matcher.fields
85
+ when Wayfarer::Routing::Matchers::Custom then route.action.to_s
86
+ when Wayfarer::Routing::Matchers::Scheme then matcher.scheme
87
+ when Wayfarer::Routing::Matchers::Suffix then matcher.suffix
88
+ end
89
+ matcher_opts.inspect
94
90
  end
95
91
 
96
- def params(route)
97
- params = route.matcher.params(url)
98
- "=> #{params.symbolize_keys}" if params.any?
99
- end
92
+ def route_params(route)
93
+ params = if route.is_a?(Wayfarer::Routing::RootRoute)
94
+ result = Wayfarer::Routing::PathFinder.result(route, url)
95
+ result.params if result.is_a?(Wayfarer::Routing::Result::Match)
96
+ else
97
+ route.matcher.params(url)
98
+ end
100
99
 
101
- private
100
+ "params: #{params.symbolize_keys}" if params&.any?
101
+ end
102
102
 
103
103
  def parents(route, current = [])
104
104
  return current unless route.parent
@@ -107,9 +107,23 @@ module Wayfarer
107
107
  end
108
108
 
109
109
  def trailer?(route)
110
- return true unless route.parent
110
+ !route.parent || route.parent.children.last == route
111
+ end
112
+
113
+ def route_colors(route)
114
+ if path_finder.path.include?(route)
115
+ %i[green bold]
116
+ elsif route.matcher.match(url)
117
+ %i[green]
118
+ else
119
+ %i[red]
120
+ end
121
+ end
122
+
123
+ def set_color(string, *colors)
124
+ return string if ENV.key?("NO_COLOR")
111
125
 
112
- route.parent.children.last == route
126
+ super(string, *colors)
113
127
  end
114
128
  end
115
129
  end
data/lib/wayfarer/cli.rb ADDED
@@ -0,0 +1,121 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Wayfarer
4
+ class CLI < Thor
5
+ def self.exit_on_failure?
6
+ true
7
+ end
8
+
9
+ desc "version", "Print version"
10
+ def version
11
+ say Wayfarer::VERSION::STRING
12
+ end
13
+
14
+ class_option :require, aliases: :r, type: :string, default: nil
15
+
16
+ desc "route JOB URL", "Routing result of URL for JOB"
17
+ def route(job, url)
18
+ load_environment
19
+
20
+ url = parsed_url(url)
21
+ job = job.classify.constantize
22
+ route = job.route
23
+ route.invoke(url)
24
+
25
+ result = Wayfarer::Routing::PathFinder.result(route, url)
26
+ result_type = result.class.name.demodulize
27
+
28
+ say case result
29
+ when Wayfarer::Routing::Result::Match
30
+ "#{result_type} => #{result.action.inspect}"
31
+ else
32
+ result_type
33
+ end
34
+ end
35
+
36
+ desc "tree JOB URL", "Visualize JOB's routing tree for URL"
37
+ def tree(job, url)
38
+ load_environment
39
+
40
+ url = parsed_url(url)
41
+ job = job.classify.constantize
42
+ route = job.route
43
+ route.invoke(url)
44
+
45
+ Wayfarer::CLI::RoutePrinter.print(route, url)
46
+ end
47
+
48
+ desc "perform JOB URL", "Perform JOB with URL"
49
+ option :mock_redis, type: :boolean
50
+ option :batch, type: :string, default: SecureRandom.uuid
51
+ def perform(job, url)
52
+ load_environment
53
+ mock_redis
54
+
55
+ job = job.classify.constantize
56
+ task = Wayfarer::Task.new(url, options[:batch])
57
+ job.new(task).perform_now
58
+ end
59
+
60
+ desc "enqueue JOB URL", "Enqueue JOB with URL"
61
+ option :batch, type: :string, default: SecureRandom.uuid
62
+ def enqueue(job, url)
63
+ load_environment
64
+
65
+ job = job.classify.constantize
66
+ job.crawl(url, batch: options[:batch])
67
+ end
68
+
69
+ desc "execute JOB URL", "Execute JOB with async adapter starting from URL"
70
+ option :mock_redis, type: :boolean
71
+ option :batch, type: :string, default: SecureRandom.uuid
72
+ option :min_threads, type: :numeric, default: 1
73
+ option :max_threads, type: :numeric, default: 1
74
+ def execute(job, url)
75
+ load_environment
76
+ mock_redis
77
+
78
+ job = job.classify.constantize
79
+ job.queue_adapter = ActiveJob::QueueAdapters::AsyncAdapter.new(min_threads: options[:min_threads],
80
+ max_threads: options[:max_threads])
81
+ scheduler = job.queue_adapter.instance_variable_get(:@scheduler)
82
+ executor = scheduler.instance_variable_get(:@async_executor)
83
+
84
+ job.crawl(url, batch: options[:batch])
85
+
86
+ sleep(0.1) while executor.scheduled_task_count > executor.completed_task_count
87
+
88
+ Wayfarer::Networking::Pool.instance.free
89
+ end
90
+
91
+ private
92
+
93
+ def mock_redis
94
+ Wayfarer.config[:redis][:factory] = ->(_) { MockRedis.new } if options[:mock_redis]
95
+ end
96
+
97
+ def parsed_url(url)
98
+ Addressable::URI.parse(url).normalize
99
+ end
100
+
101
+ def load_environment(require_path = options[:require])
102
+ require File.join(Dir.pwd, require_path) if require_path
103
+
104
+ load_rails
105
+ end
106
+
107
+ def load_rails
108
+ begin
109
+ require "rails/app_loader"
110
+ rescue LoadError
111
+ return
112
+ end
113
+
114
+ return unless Rails::AppLoader.find_executable
115
+
116
+ require File.expand_path("config/application", Dir.pwd)
117
+ require File.expand_path("config/boot", Dir.pwd)
118
+ require File.expand_path("config/environment", Dir.pwd)
119
+ end
120
+ end
121
+ end
data/lib/wayfarer/gc.rb CHANGED
@@ -1,14 +1,21 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Wayfarer
4
- GC = Struct.new(:task) do
5
- def run
6
- return unless task.counter.decrement <= 0
4
+ # TODO: Add logging
5
+ module GC
6
+ RESETTABLES = [Wayfarer::Redis::Barrier, Wayfarer::Redis::Counter].freeze
7
7
 
8
- task.metadata.job.run_callbacks(:batch)
8
+ class << self
9
+ include Wayfarer::Logging.emit(gc: [:info, "Garbage collecting %<resettable>s"])
10
+ end
11
+
12
+ module_function
9
13
 
10
- task.barrier.reset!
11
- task.counter.reset!
14
+ def run(task)
15
+ RESETTABLES.each do |resettable|
16
+ log(:gc, task, resettable: resettable)
17
+ resettable.new(task).reset!
18
+ end
12
19
  end
13
20
  end
14
21
  end
data/lib/wayfarer/handler.rb CHANGED
@@ -1,15 +1,23 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Wayfarer
4
- class Handler
5
- include Wayfarer::Middleware::Controller
4
+ module Handler
5
+ extend ActiveSupport::Concern
6
6
 
7
- api Wayfarer::Middleware::Fetch
8
- api Wayfarer::Middleware::Stage
7
+ included do
8
+ include Wayfarer::Middleware::Controller
9
9
 
10
- use Wayfarer::Middleware::Router
11
- use Wayfarer::Middleware::Dispatch
10
+ use Wayfarer::Middleware::ContentType
11
+ use Wayfarer::Middleware::Router
12
+ use Wayfarer::Middleware::Dispatch
12
13
 
13
- singleton_class.undef_method :after_batch
14
+ api Wayfarer::Middleware::UserAgent
15
+ api Wayfarer::Middleware::Stage
16
+
17
+ singleton_class.undef_method :before_fetch
18
+ singleton_class.undef_method :around_fetch
19
+ singleton_class.undef_method :after_fetch
20
+ singleton_class.undef_method :after_batch
21
+ end
14
22
  end
15
23
  end
data/lib/wayfarer/logging.rb ADDED
@@ -0,0 +1,38 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Wayfarer
4
+ module Logging
5
+ mattr_accessor :logger, default: ActiveSupport::Logger.new($stdout)
6
+
7
+ def self.emit(...)
8
+ Emitter.new(...)
9
+ end
10
+
11
+ class Emitter < Module
12
+ def initialize(messages)
13
+ @messages = messages
14
+
15
+ super()
16
+ end
17
+
18
+ def included(base)
19
+ messages = @messages
20
+
21
+ base.class_eval do
22
+ define_method(:log) do |key, task, **args|
23
+ level, msg = messages[key] || raise(ArgumentError, "No log message for #{key.inspect}")
24
+ severity = ActiveSupport::Logger::Severity.const_get(level.upcase)
25
+
26
+ ActiveSupport::TaggedLogging
27
+ .new(Logging.logger)
28
+ .tagged(task.batch, task.url, task[:controller]&.class&.name) do |logger|
29
+ logger.add(severity, msg % args)
30
+ end
31
+ end
32
+ end
33
+ end
34
+ end
35
+
36
+ private_constant :Emitter
37
+ end
38
+ end
data/lib/wayfarer/middleware/base.rb CHANGED
@@ -3,6 +3,8 @@
3
3
  module Wayfarer
4
4
  module Middleware
5
5
  module Base
6
+ extend ActiveSupport::Concern
7
+
6
8
  API_MODULE = :API
7
9
 
8
10
  def api
data/lib/wayfarer/middleware/batch_completion.rb ADDED
@@ -0,0 +1,19 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Wayfarer
4
+ module Middleware
5
+ class BatchCompletion
6
+ extend Base
7
+
8
+ def call(task)
9
+ # Comparing to the initial state of `exception_executions` allows
10
+ # us to determine if an exception occurred when the job was performed,
11
+ # since the `perform.active_job` event is emitted for both successful
12
+ # and raising jobs.
13
+ task[:initial_exception_executions] ||= task[:job].exception_executions.clone
14
+
15
+ yield if block_given?
16
+ end
17
+ end
18
+ end
19
+ end
data/lib/wayfarer/middleware/content_type.rb ADDED
@@ -0,0 +1,54 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Wayfarer
4
+ module Middleware
5
+ class ContentType
6
+ extend Base
7
+
8
+ module API
9
+ extend ActiveSupport::Concern
10
+
11
+ included do
12
+ class_attribute :allowed_content_types,
13
+ default: { index: {}, patterns: Set.new },
14
+ instance_accessor: false,
15
+ instance_predicate: false
16
+ end
17
+
18
+ class_methods do
19
+ def content_type(*content_types)
20
+ content_types.each do |content_type|
21
+ case content_type
22
+ when String then allowed_content_types[:index][content_type] = true
23
+ when Regexp then allowed_content_types[:patterns] << content_type
24
+ end
25
+ end
26
+ end
27
+ end
28
+ end
29
+
30
+ def call(task)
31
+ yield if block_given? && permitted?(task)
32
+ end
33
+
34
+ private
35
+
36
+ def permitted?(task)
37
+ job = task[:controller]
38
+ content_types = job.class.allowed_content_types
39
+
40
+ return true if allows_all?(content_types)
41
+
42
+ content_type = task[:page].mime_type&.to_s || task[:page].headers["content-type"] || (return false)
43
+
44
+ content_types[:index].key?(content_type) || content_types[:patterns].any? do |pattern|
45
+ pattern.match?(content_type)
46
+ end
47
+ end
48
+
49
+ def allows_all?(content_types)
50
+ content_types[:index].empty? && content_types[:patterns].empty?
51
+ end
52
+ end
53
+ end
54
+ end
data/lib/wayfarer/middleware/controller.rb CHANGED
@@ -3,16 +3,20 @@
3
3
  module Wayfarer
4
4
  module Middleware
5
5
  module Controller
6
- def self.included(base)
7
- base.cattr_accessor :chain, default: Chain.empty
8
- base.attr_accessor :task
6
+ extend ActiveSupport::Concern
9
7
 
10
- base.extend(ClassMethods)
11
- base.include(InstanceMethods)
12
- base.include(Wayfarer::Callbacks)
8
+ included do
9
+ class_attribute :chain,
10
+ default: Chain.empty,
11
+ instance_accessor: false,
12
+ instance_predicate: false
13
+
14
+ attr_accessor :task
15
+
16
+ include Wayfarer::Callbacks
13
17
  end
14
18
 
15
- module ClassMethods
19
+ class_methods do
16
20
  def use(middleware)
17
21
  chain.push(middleware.lazy)
18
22
  api(middleware)
@@ -23,17 +27,17 @@ module Wayfarer
23
27
  end
24
28
  end
25
29
 
26
- module InstanceMethods
27
- def call(task)
28
- self.task = task
30
+ def call(task)
31
+ self.task = task
29
32
 
30
- task.metadata.job ||= self
31
- task.metadata.controller = self
33
+ task[:job] ||= self
34
+ task[:controller] = self
32
35
 
33
- self.class.chain.call(task) do
34
- yield if block_given?
35
- end
36
+ self.class.chain.call(task) do
37
+ yield if block_given?
36
38
  end
39
+
40
+ task[:return_value]
37
41
  end
38
42
  end
39
43
  end
data/lib/wayfarer/middleware/dedup.rb CHANGED
@@ -5,25 +5,28 @@ module Wayfarer
5
5
  class Dedup
6
6
  extend Base
7
7
 
8
+ include Wayfarer::Logging.emit(
9
+ deduplicated: [:info, "Deduplicated URL"],
10
+ retry: [:debug, "Not deduplicating retry"],
11
+ rerouted: [:debug, "Not deduplicating rerouted task"]
12
+ )
13
+
8
14
  def call(task)
9
- # Was task routed by a previous controller already?
10
- return yield if task.metadata.action
15
+ task[:barrier] ||= Wayfarer::Redis::Barrier.new(task)
11
16
 
12
- return if task.barrier.seen?(task.url)
17
+ if task[:job].executions > 1
18
+ log(:retry, task)
19
+ return yield if block_given?
20
+ end
13
21
 
14
- begin
15
- yield if block_given?
16
- rescue StandardError => e
17
- task.barrier.unsee(task.url)
18
- raise e
22
+ if task[:job] != task[:controller]
23
+ log(:rerouted, task)
24
+ return yield if block_given?
19
25
  end
20
26
 
21
- staged_urls = task.metadata.staged_urls
22
- return if staged_urls.none?
27
+ return log(:deduplicated, task) if task[:barrier].check!(task[:normalized_url])
23
28
 
24
- inclusion = task.barrier.peek(staged_urls.to_a)
25
- unseen = staged_urls.zip(inclusion).reject { |_, seen| seen }.map(&:first)
26
- task.metadata.staged_urls = SortedSet.new(unseen)
29
+ yield if block_given?
27
30
  end
28
31
  end
29
32
  end