wayfarer 0.4.5 → 0.4.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.github/workflows/lint.yaml +25 -0
- data/.github/workflows/release.yaml +29 -0
- data/.github/workflows/tests.yaml +30 -0
- data/.gitignore +4 -0
- data/.rubocop.yml +5 -0
- data/.vale.ini +5 -0
- data/.yardopts +1 -3
- data/Dockerfile +5 -4
- data/Gemfile +3 -0
- data/Gemfile.lock +107 -102
- data/Rakefile +5 -56
- data/bin/wayfarer +1 -1
- data/docker-compose.yml +20 -9
- data/docs/cookbook/consent_screen.md +2 -2
- data/docs/cookbook/executing_javascript.md +3 -3
- data/docs/cookbook/navigation.md +12 -12
- data/docs/cookbook/querying_html.md +3 -3
- data/docs/cookbook/screenshots.md +2 -2
- data/docs/cookbook/user_agent.md +1 -1
- data/docs/design.md +36 -0
- data/docs/guides/callbacks.md +24 -126
- data/docs/guides/configuration.md +8 -8
- data/docs/guides/handlers.md +60 -0
- data/docs/guides/index.md +1 -0
- data/docs/guides/jobs/error_handling.md +40 -0
- data/docs/guides/jobs.md +99 -31
- data/docs/guides/navigation.md +1 -1
- data/docs/guides/networking/capybara.md +13 -22
- data/docs/guides/networking/custom_adapters.md +82 -41
- data/docs/guides/networking/ferrum.md +4 -4
- data/docs/guides/networking/http.md +9 -13
- data/docs/guides/networking/selenium.md +10 -11
- data/docs/guides/pages.md +76 -10
- data/docs/guides/redis.md +10 -0
- data/docs/guides/routing.md +74 -0
- data/docs/guides/tasks.md +33 -9
- data/docs/guides/tutorial.md +60 -0
- data/docs/guides/user_agents.md +113 -0
- data/docs/index.md +17 -40
- data/docs/reference/cli.md +35 -25
- data/docs/reference/configuration.md +36 -0
- data/lib/wayfarer/base.rb +124 -46
- data/lib/wayfarer/batch_completion.rb +56 -0
- data/lib/wayfarer/callbacks.rb +22 -48
- data/lib/wayfarer/cli/route_printer.rb +71 -57
- data/lib/wayfarer/cli.rb +121 -0
- data/lib/wayfarer/gc.rb +13 -6
- data/lib/wayfarer/handler.rb +15 -7
- data/lib/wayfarer/logging.rb +38 -0
- data/lib/wayfarer/middleware/base.rb +2 -0
- data/lib/wayfarer/middleware/batch_completion.rb +19 -0
- data/lib/wayfarer/middleware/content_type.rb +54 -0
- data/lib/wayfarer/middleware/controller.rb +19 -15
- data/lib/wayfarer/middleware/dedup.rb +16 -13
- data/lib/wayfarer/middleware/dispatch.rb +12 -4
- data/lib/wayfarer/middleware/normalize.rb +12 -11
- data/lib/wayfarer/middleware/redis.rb +15 -0
- data/lib/wayfarer/middleware/router.rb +33 -35
- data/lib/wayfarer/middleware/stage.rb +5 -5
- data/lib/wayfarer/middleware/uri_parser.rb +30 -0
- data/lib/wayfarer/middleware/user_agent.rb +49 -0
- data/lib/wayfarer/networking/capybara.rb +1 -1
- data/lib/wayfarer/networking/context.rb +2 -2
- data/lib/wayfarer/networking/ferrum.rb +2 -2
- data/lib/wayfarer/networking/follow.rb +12 -6
- data/lib/wayfarer/networking/http.rb +1 -1
- data/lib/wayfarer/networking/pool.rb +17 -12
- data/lib/wayfarer/networking/selenium.rb +3 -3
- data/lib/wayfarer/networking/strategy.rb +2 -2
- data/lib/wayfarer/page.rb +36 -14
- data/lib/wayfarer/parsing/xml.rb +6 -6
- data/lib/wayfarer/parsing.rb +24 -0
- data/lib/wayfarer/redis/barrier.rb +13 -21
- data/lib/wayfarer/redis/counter.rb +19 -9
- data/lib/wayfarer/redis/pool.rb +1 -1
- data/lib/wayfarer/redis/resettable.rb +19 -0
- data/lib/wayfarer/routing/dsl.rb +1 -0
- data/lib/wayfarer/routing/matchers/path.rb +4 -2
- data/lib/wayfarer/routing/root_route.rb +5 -1
- data/lib/wayfarer/routing/route.rb +4 -14
- data/lib/wayfarer/stringify.rb +22 -30
- data/lib/wayfarer/task.rb +12 -18
- data/lib/wayfarer.rb +29 -2
- data/mkdocs.yml +52 -7
- data/rake/docs.rake +26 -0
- data/rake/lint.rake +105 -0
- data/rake/release.rake +29 -0
- data/rake/tests.rake +28 -0
- data/requirements.txt +1 -1
- data/spec/base_spec.rb +140 -160
- data/spec/batch_completion_spec.rb +104 -0
- data/spec/cli/job_spec.rb +19 -23
- data/spec/cli/routing_spec.rb +101 -0
- data/spec/cli/version_spec.rb +1 -1
- data/spec/factories/task.rb +7 -1
- data/spec/fixtures/dummy_job.rb +5 -3
- data/spec/gc_spec.rb +8 -50
- data/spec/handler_spec.rb +1 -1
- data/spec/integration/callbacks_spec.rb +157 -45
- data/spec/integration/content_type_spec.rb +145 -0
- data/spec/integration/gc_spec.rb +44 -0
- data/spec/integration/handler_spec.rb +66 -0
- data/spec/integration/page_spec.rb +44 -29
- data/spec/integration/params_spec.rb +33 -25
- data/spec/integration/parsing_spec.rb +125 -0
- data/spec/integration/routing_spec.rb +18 -0
- data/spec/integration/stage_spec.rb +27 -20
- data/spec/middleware/batch_completion_spec.rb +34 -0
- data/spec/middleware/chain_spec.rb +8 -8
- data/spec/middleware/content_type_spec.rb +86 -0
- data/spec/middleware/controller_spec.rb +5 -5
- data/spec/middleware/dedup_spec.rb +38 -55
- data/spec/middleware/dispatch_spec.rb +23 -7
- data/spec/middleware/normalize_spec.rb +44 -13
- data/spec/middleware/router_spec.rb +29 -30
- data/spec/middleware/stage_spec.rb +8 -8
- data/spec/middleware/uri_parser_spec.rb +53 -0
- data/spec/middleware/{fetch_spec.rb → user_agent_spec.rb} +28 -27
- data/spec/networking/context_spec.rb +17 -0
- data/spec/networking/follow_spec.rb +2 -2
- data/spec/networking/pool_spec.rb +5 -5
- data/spec/networking/strategy.rb +2 -2
- data/spec/page_spec.rb +42 -20
- data/spec/parsing/xml_spec.rb +11 -12
- data/spec/redis/barrier_spec.rb +8 -48
- data/spec/redis/counter_spec.rb +13 -1
- data/spec/redis/pool_spec.rb +1 -1
- data/spec/spec_helpers.rb +27 -16
- data/spec/support/test_app.rb +8 -0
- data/spec/task_spec.rb +3 -24
- data/spec/wayfarer_spec.rb +1 -1
- data/wayfarer.gemspec +4 -3
- metadata +61 -51
- data/.github/workflows/ci.yaml +0 -32
- data/docs/guides/error_handling.md +0 -31
- data/docs/guides/networking.md +0 -94
- data/docs/guides/performance.md +0 -130
- data/docs/guides/reliability.md +0 -41
- data/docs/guides/routing/steering.md +0 -30
- data/docs/reference/api/base.md +0 -48
- data/docs/reference/configuration_keys.md +0 -42
- data/docs/reference/environment_variables.md +0 -83
- data/lib/wayfarer/cli/base.rb +0 -45
- data/lib/wayfarer/cli/generate.rb +0 -17
- data/lib/wayfarer/cli/job.rb +0 -56
- data/lib/wayfarer/cli/route.rb +0 -29
- data/lib/wayfarer/cli/runner.rb +0 -34
- data/lib/wayfarer/cli/templates/Gemfile.tt +0 -5
- data/lib/wayfarer/cli/templates/job.rb.tt +0 -10
- data/lib/wayfarer/config/capybara.rb +0 -10
- data/lib/wayfarer/config/ferrum.rb +0 -11
- data/lib/wayfarer/config/networking.rb +0 -26
- data/lib/wayfarer/config/redis.rb +0 -14
- data/lib/wayfarer/config/root.rb +0 -11
- data/lib/wayfarer/config/selenium.rb +0 -21
- data/lib/wayfarer/config/strconv.rb +0 -45
- data/lib/wayfarer/config/struct.rb +0 -72
- data/lib/wayfarer/middleware/fetch.rb +0 -56
- data/lib/wayfarer/redis/connection.rb +0 -13
- data/lib/wayfarer/redis/version.rb +0 -19
- data/lib/wayfarer/routing/router.rb +0 -28
- data/spec/callbacks_spec.rb +0 -102
- data/spec/cli/generate_spec.rb +0 -39
- data/spec/config/capybara_spec.rb +0 -18
- data/spec/config/ferrum_spec.rb +0 -24
- data/spec/config/networking_spec.rb +0 -73
- data/spec/config/redis_spec.rb +0 -32
- data/spec/config/root_spec.rb +0 -31
- data/spec/config/selenium_spec.rb +0 -56
- data/spec/config/strconv_spec.rb +0 -58
- data/spec/config/struct_spec.rb +0 -66
- data/spec/integration/steering_spec.rb +0 -57
- data/spec/redis/version_spec.rb +0 -13
- data/spec/routing/router_spec.rb +0 -24
data/lib/wayfarer/callbacks.rb
CHANGED
@@ -2,69 +2,43 @@
|
|
2
2
|
|
3
3
|
module Wayfarer
|
4
4
|
module Callbacks
|
5
|
-
|
6
|
-
OPTIONS = { terminator: TERMINATOR, skip_after_callbacks_if_terminated: true }.freeze
|
5
|
+
extend ActiveSupport::Concern
|
7
6
|
|
8
|
-
|
9
|
-
|
10
|
-
return if only && !applies?(only)
|
11
|
-
return if except && applies?(except)
|
7
|
+
included do
|
8
|
+
include ActiveSupport::Callbacks
|
12
9
|
|
13
|
-
|
10
|
+
define_callbacks :fetch, skip_after_callbacks_if_terminated: true
|
11
|
+
define_callbacks :action, skip_after_callbacks_if_terminated: true
|
12
|
+
define_callbacks :batch
|
13
|
+
end
|
14
14
|
|
15
|
-
|
15
|
+
class_methods do
|
16
|
+
def before_fetch(*filters, &block)
|
17
|
+
set_callback(:fetch, :before, *filters, &block)
|
16
18
|
end
|
17
19
|
|
18
|
-
|
19
|
-
|
20
|
-
def applies?(condition)
|
21
|
-
case condition
|
22
|
-
when Symbol then condition == action
|
23
|
-
when Enumerable then condition&.include?(action)
|
24
|
-
end
|
20
|
+
def around_fetch(*filters, &block)
|
21
|
+
set_callback(:fetch, :around, *filters, &block)
|
25
22
|
end
|
26
23
|
|
27
|
-
def
|
28
|
-
filters
|
24
|
+
def after_fetch(*filters, &block)
|
25
|
+
set_callback(:fetch, :after, *filters, &block)
|
29
26
|
end
|
30
27
|
|
31
|
-
def
|
32
|
-
filters
|
28
|
+
def before_action(*filters, &block)
|
29
|
+
set_callback(:action, :before, *filters, &block)
|
33
30
|
end
|
34
31
|
|
35
|
-
def
|
36
|
-
|
32
|
+
def around_action(*filters, &block)
|
33
|
+
set_callback(:action, :around, *filters, &block)
|
37
34
|
end
|
38
35
|
|
39
|
-
def
|
40
|
-
|
36
|
+
def after_action(*filters, &block)
|
37
|
+
set_callback(:action, :after, *filters, &block)
|
41
38
|
end
|
42
|
-
end
|
43
|
-
|
44
|
-
def self.included(base)
|
45
|
-
base.include(ActiveSupport::Callbacks)
|
46
|
-
base.extend(ClassMethods)
|
47
|
-
|
48
|
-
base.class_eval do
|
49
|
-
define_callbacks(:fetch, OPTIONS)
|
50
|
-
define_callbacks(:action, OPTIONS)
|
51
|
-
define_callbacks(:batch, OPTIONS)
|
52
|
-
|
53
|
-
define(:fetch, :before)
|
54
|
-
define(:action, :before)
|
55
|
-
define(:batch, :after)
|
56
|
-
end
|
57
|
-
end
|
58
|
-
|
59
|
-
module ClassMethods
|
60
|
-
private
|
61
39
|
|
62
|
-
def
|
63
|
-
|
64
|
-
set_callback(name, stage, **filters) do |job|
|
65
|
-
ConditionalCallback.new(job, filters).run(method, &block)
|
66
|
-
end
|
67
|
-
end
|
40
|
+
def after_batch(*filters, &block)
|
41
|
+
set_callback(:batch, :after, *filters, &block)
|
68
42
|
end
|
69
43
|
end
|
70
44
|
end
|
@@ -1,16 +1,14 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
3
|
module Wayfarer
|
4
|
-
|
4
|
+
class CLI
|
5
5
|
class RoutePrinter < Thor::Shell::Color
|
6
|
-
attr_reader :url,
|
7
|
-
:path_finder,
|
8
|
-
:output
|
6
|
+
attr_reader :url, :path_finder, :output
|
9
7
|
|
10
|
-
INDENT
|
11
|
-
REGULAR_SEGMENT
|
8
|
+
INDENT = " "
|
9
|
+
REGULAR_SEGMENT = "│ "
|
12
10
|
JUNCTION_SEGMENT = "├──"
|
13
|
-
CORNER_SEGMENT
|
11
|
+
CORNER_SEGMENT = "└──"
|
14
12
|
|
15
13
|
def self.print(route, url)
|
16
14
|
route.accept(new(url))
|
@@ -24,81 +22,83 @@ module Wayfarer
|
|
24
22
|
|
25
23
|
def visit(route)
|
26
24
|
route.accept(path_finder) unless route.parent
|
27
|
-
|
28
|
-
|
29
|
-
puts [segments(route), label(route)].join("")[3..]
|
25
|
+
puts format_route_output(route)
|
30
26
|
true
|
31
27
|
end
|
32
28
|
|
29
|
+
private
|
30
|
+
|
31
|
+
def format_route_output(route)
|
32
|
+
[segments(route), route_description(route)].join[3..]
|
33
|
+
end
|
34
|
+
|
33
35
|
def segments(route)
|
34
|
-
|
35
|
-
parents = parents(route).map { |parent| parent_segment(parent) }
|
36
|
-
[parents, current].join
|
36
|
+
[parents(route).map { |parent| parent_segment(parent) }, segment(route)].join
|
37
37
|
end
|
38
38
|
|
39
39
|
def parent_segment(parent)
|
40
|
-
|
41
|
-
INDENT
|
42
|
-
else
|
43
|
-
REGULAR_SEGMENT
|
44
|
-
end
|
40
|
+
trailer?(parent) ? INDENT : REGULAR_SEGMENT
|
45
41
|
end
|
46
42
|
|
47
43
|
def segment(route)
|
48
|
-
|
49
|
-
CORNER_SEGMENT
|
50
|
-
else
|
51
|
-
JUNCTION_SEGMENT
|
52
|
-
end
|
44
|
+
trailer?(route) ? CORNER_SEGMENT : JUNCTION_SEGMENT
|
53
45
|
end
|
54
46
|
|
55
|
-
def
|
56
|
-
[
|
57
|
-
|
58
|
-
|
47
|
+
def route_description(route)
|
48
|
+
attrs = [route_arg(route), routing_result(route), route_action(route), route_params(route)].compact
|
49
|
+
text = attrs.any? ? "#{matcher_name(route)}(#{attrs.join(', ')})" : matcher_name(route)
|
50
|
+
set_color(text, *route_colors(route))
|
59
51
|
end
|
60
52
|
|
61
|
-
def
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
53
|
+
def matcher_name(route)
|
54
|
+
case route
|
55
|
+
when Wayfarer::Routing::TargetRoute
|
56
|
+
"Target"
|
57
|
+
when Wayfarer::Routing::RootRoute
|
58
|
+
Wayfarer::Routing::PathFinder.result(route, url).class.name.demodulize
|
66
59
|
else
|
67
|
-
|
60
|
+
route.matcher.class.name.demodulize
|
68
61
|
end
|
69
62
|
end
|
70
63
|
|
71
|
-
def
|
72
|
-
return
|
64
|
+
def routing_result(route)
|
65
|
+
return if route.is_a?(Wayfarer::Routing::RootRoute)
|
73
66
|
|
74
|
-
|
67
|
+
"match: #{route.matcher.match(url)}"
|
75
68
|
end
|
76
69
|
|
77
|
-
def
|
78
|
-
return
|
70
|
+
def route_action(route)
|
71
|
+
return unless route.is_a?(Wayfarer::Routing::RootRoute)
|
79
72
|
|
80
|
-
route
|
73
|
+
result = Wayfarer::Routing::PathFinder.result(route, url)
|
74
|
+
result.action.inspect if result.is_a?(Wayfarer::Routing::Result::Match)
|
81
75
|
end
|
82
76
|
|
83
|
-
def
|
84
|
-
return
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
77
|
+
def route_arg(route)
|
78
|
+
return if route.is_a?(Wayfarer::Routing::RootRoute) || route.is_a?(Wayfarer::Routing::TargetRoute)
|
79
|
+
|
80
|
+
matcher = route.matcher
|
81
|
+
matcher_opts = case matcher
|
82
|
+
when Wayfarer::Routing::Matchers::Host then matcher.host
|
83
|
+
when Wayfarer::Routing::Matchers::Path then matcher.path
|
84
|
+
when Wayfarer::Routing::Matchers::Query then matcher.fields
|
85
|
+
when Wayfarer::Routing::Matchers::Custom then route.action.to_s
|
86
|
+
when Wayfarer::Routing::Matchers::Scheme then matcher.scheme
|
87
|
+
when Wayfarer::Routing::Matchers::Suffix then matcher.suffix
|
88
|
+
end
|
89
|
+
matcher_opts.inspect
|
94
90
|
end
|
95
91
|
|
96
|
-
def
|
97
|
-
params = route.
|
98
|
-
|
99
|
-
|
92
|
+
def route_params(route)
|
93
|
+
params = if route.is_a?(Wayfarer::Routing::RootRoute)
|
94
|
+
result = Wayfarer::Routing::PathFinder.result(route, url)
|
95
|
+
result.params if result.is_a?(Wayfarer::Routing::Result::Match)
|
96
|
+
else
|
97
|
+
route.matcher.params(url)
|
98
|
+
end
|
100
99
|
|
101
|
-
|
100
|
+
"params: #{params.symbolize_keys}" if params&.any?
|
101
|
+
end
|
102
102
|
|
103
103
|
def parents(route, current = [])
|
104
104
|
return current unless route.parent
|
@@ -107,9 +107,23 @@ module Wayfarer
|
|
107
107
|
end
|
108
108
|
|
109
109
|
def trailer?(route)
|
110
|
-
|
110
|
+
!route.parent || route.parent.children.last == route
|
111
|
+
end
|
112
|
+
|
113
|
+
def route_colors(route)
|
114
|
+
if path_finder.path.include?(route)
|
115
|
+
%i[green bold]
|
116
|
+
elsif route.matcher.match(url)
|
117
|
+
%i[green]
|
118
|
+
else
|
119
|
+
%i[red]
|
120
|
+
end
|
121
|
+
end
|
122
|
+
|
123
|
+
def set_color(string, *colors)
|
124
|
+
return string if ENV.key?("NO_COLOR")
|
111
125
|
|
112
|
-
|
126
|
+
super(string, *colors)
|
113
127
|
end
|
114
128
|
end
|
115
129
|
end
|
data/lib/wayfarer/cli.rb
ADDED
@@ -0,0 +1,121 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Wayfarer
|
4
|
+
class CLI < Thor
|
5
|
+
def self.exit_on_failure?
|
6
|
+
true
|
7
|
+
end
|
8
|
+
|
9
|
+
desc "version", "Print version"
|
10
|
+
def version
|
11
|
+
say Wayfarer::VERSION::STRING
|
12
|
+
end
|
13
|
+
|
14
|
+
class_option :require, aliases: :r, type: :string, default: nil
|
15
|
+
|
16
|
+
desc "route JOB URL", "Routing result of URL for JOB"
|
17
|
+
def route(job, url)
|
18
|
+
load_environment
|
19
|
+
|
20
|
+
url = parsed_url(url)
|
21
|
+
job = job.classify.constantize
|
22
|
+
route = job.route
|
23
|
+
route.invoke(url)
|
24
|
+
|
25
|
+
result = Wayfarer::Routing::PathFinder.result(route, url)
|
26
|
+
result_type = result.class.name.demodulize
|
27
|
+
|
28
|
+
say case result
|
29
|
+
when Wayfarer::Routing::Result::Match
|
30
|
+
"#{result_type} => #{result.action.inspect}"
|
31
|
+
else
|
32
|
+
result_type
|
33
|
+
end
|
34
|
+
end
|
35
|
+
|
36
|
+
desc "tree JOB URL", "Visualize JOB's routing tree for URL"
|
37
|
+
def tree(job, url)
|
38
|
+
load_environment
|
39
|
+
|
40
|
+
url = parsed_url(url)
|
41
|
+
job = job.classify.constantize
|
42
|
+
route = job.route
|
43
|
+
route.invoke(url)
|
44
|
+
|
45
|
+
Wayfarer::CLI::RoutePrinter.print(route, url)
|
46
|
+
end
|
47
|
+
|
48
|
+
desc "perform JOB URL", "Perform JOB with URL"
|
49
|
+
option :mock_redis, type: :boolean
|
50
|
+
option :batch, type: :string, default: SecureRandom.uuid
|
51
|
+
def perform(job, url)
|
52
|
+
load_environment
|
53
|
+
mock_redis
|
54
|
+
|
55
|
+
job = job.classify.constantize
|
56
|
+
task = Wayfarer::Task.new(url, options[:batch])
|
57
|
+
job.new(task).perform_now
|
58
|
+
end
|
59
|
+
|
60
|
+
desc "enqueue JOB URL", "Enqueue JOB with URL"
|
61
|
+
option :batch, type: :string, default: SecureRandom.uuid
|
62
|
+
def enqueue(job, url)
|
63
|
+
load_environment
|
64
|
+
|
65
|
+
job = job.classify.constantize
|
66
|
+
job.crawl(url, batch: options[:batch])
|
67
|
+
end
|
68
|
+
|
69
|
+
desc "execute JOB URL", "Execute JOB with async adapter starting from URL"
|
70
|
+
option :mock_redis, type: :boolean
|
71
|
+
option :batch, type: :string, default: SecureRandom.uuid
|
72
|
+
option :min_threads, type: :numeric, default: 1
|
73
|
+
option :max_threads, type: :numeric, default: 1
|
74
|
+
def execute(job, url)
|
75
|
+
load_environment
|
76
|
+
mock_redis
|
77
|
+
|
78
|
+
job = job.classify.constantize
|
79
|
+
job.queue_adapter = ActiveJob::QueueAdapters::AsyncAdapter.new(min_threads: options[:min_threads],
|
80
|
+
max_threads: options[:max_threads])
|
81
|
+
scheduler = job.queue_adapter.instance_variable_get(:@scheduler)
|
82
|
+
executor = scheduler.instance_variable_get(:@async_executor)
|
83
|
+
|
84
|
+
job.crawl(url, batch: options[:batch])
|
85
|
+
|
86
|
+
sleep(0.1) while executor.scheduled_task_count > executor.completed_task_count
|
87
|
+
|
88
|
+
Wayfarer::Networking::Pool.instance.free
|
89
|
+
end
|
90
|
+
|
91
|
+
private
|
92
|
+
|
93
|
+
def mock_redis
|
94
|
+
Wayfarer.config[:redis][:factory] = ->(_) { MockRedis.new } if options[:mock_redis]
|
95
|
+
end
|
96
|
+
|
97
|
+
def parsed_url(url)
|
98
|
+
Addressable::URI.parse(url).normalize
|
99
|
+
end
|
100
|
+
|
101
|
+
def load_environment(require_path = options[:require])
|
102
|
+
require File.join(Dir.pwd, require_path) if require_path
|
103
|
+
|
104
|
+
load_rails
|
105
|
+
end
|
106
|
+
|
107
|
+
def load_rails
|
108
|
+
begin
|
109
|
+
require "rails/app_loader"
|
110
|
+
rescue LoadError
|
111
|
+
return
|
112
|
+
end
|
113
|
+
|
114
|
+
return unless Rails::AppLoader.find_executable
|
115
|
+
|
116
|
+
require File.expand_path("config/application", Dir.pwd)
|
117
|
+
require File.expand_path("config/boot", Dir.pwd)
|
118
|
+
require File.expand_path("config/environment", Dir.pwd)
|
119
|
+
end
|
120
|
+
end
|
121
|
+
end
|
data/lib/wayfarer/gc.rb
CHANGED
@@ -1,14 +1,21 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
3
|
module Wayfarer
|
4
|
-
|
5
|
-
|
6
|
-
|
4
|
+
# TODO: Add logging
|
5
|
+
module GC
|
6
|
+
RESETTABLES = [Wayfarer::Redis::Barrier, Wayfarer::Redis::Counter].freeze
|
7
7
|
|
8
|
-
|
8
|
+
class << self
|
9
|
+
include Wayfarer::Logging.emit(gc: [:info, "Garbage collecting %<resettable>s"])
|
10
|
+
end
|
11
|
+
|
12
|
+
module_function
|
9
13
|
|
10
|
-
|
11
|
-
|
14
|
+
def run(task)
|
15
|
+
RESETTABLES.each do |resettable|
|
16
|
+
log(:gc, task, resettable: resettable)
|
17
|
+
resettable.new(task).reset!
|
18
|
+
end
|
12
19
|
end
|
13
20
|
end
|
14
21
|
end
|
data/lib/wayfarer/handler.rb
CHANGED
@@ -1,15 +1,23 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
3
|
module Wayfarer
|
4
|
-
|
5
|
-
|
4
|
+
module Handler
|
5
|
+
extend ActiveSupport::Concern
|
6
6
|
|
7
|
-
|
8
|
-
|
7
|
+
included do
|
8
|
+
include Wayfarer::Middleware::Controller
|
9
9
|
|
10
|
-
|
11
|
-
|
10
|
+
use Wayfarer::Middleware::ContentType
|
11
|
+
use Wayfarer::Middleware::Router
|
12
|
+
use Wayfarer::Middleware::Dispatch
|
12
13
|
|
13
|
-
|
14
|
+
api Wayfarer::Middleware::UserAgent
|
15
|
+
api Wayfarer::Middleware::Stage
|
16
|
+
|
17
|
+
singleton_class.undef_method :before_fetch
|
18
|
+
singleton_class.undef_method :around_fetch
|
19
|
+
singleton_class.undef_method :after_fetch
|
20
|
+
singleton_class.undef_method :after_batch
|
21
|
+
end
|
14
22
|
end
|
15
23
|
end
|
@@ -0,0 +1,38 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Wayfarer
|
4
|
+
module Logging
|
5
|
+
mattr_accessor :logger, default: ActiveSupport::Logger.new($stdout)
|
6
|
+
|
7
|
+
def self.emit(...)
|
8
|
+
Emitter.new(...)
|
9
|
+
end
|
10
|
+
|
11
|
+
class Emitter < Module
|
12
|
+
def initialize(messages)
|
13
|
+
@messages = messages
|
14
|
+
|
15
|
+
super()
|
16
|
+
end
|
17
|
+
|
18
|
+
def included(base)
|
19
|
+
messages = @messages
|
20
|
+
|
21
|
+
base.class_eval do
|
22
|
+
define_method(:log) do |key, task, **args|
|
23
|
+
level, msg = messages[key] || raise(ArgumentError, "No log message for #{key.inspect}")
|
24
|
+
severity = ActiveSupport::Logger::Severity.const_get(level.upcase)
|
25
|
+
|
26
|
+
ActiveSupport::TaggedLogging
|
27
|
+
.new(Logging.logger)
|
28
|
+
.tagged(task.batch, task.url, task[:controller]&.class&.name) do |logger|
|
29
|
+
logger.add(severity, msg % args)
|
30
|
+
end
|
31
|
+
end
|
32
|
+
end
|
33
|
+
end
|
34
|
+
end
|
35
|
+
|
36
|
+
private_constant :Emitter
|
37
|
+
end
|
38
|
+
end
|
@@ -0,0 +1,19 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Wayfarer
|
4
|
+
module Middleware
|
5
|
+
class BatchCompletion
|
6
|
+
extend Base
|
7
|
+
|
8
|
+
def call(task)
|
9
|
+
# Comparing to the initial state of `exception_executions` allows
|
10
|
+
# us to determine if an exception occurred when the job was performed,
|
11
|
+
# since the `perform.active_job` event is emitted for both successful
|
12
|
+
# and raising jobs.
|
13
|
+
task[:initial_exception_executions] ||= task[:job].exception_executions.clone
|
14
|
+
|
15
|
+
yield if block_given?
|
16
|
+
end
|
17
|
+
end
|
18
|
+
end
|
19
|
+
end
|
@@ -0,0 +1,54 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Wayfarer
|
4
|
+
module Middleware
|
5
|
+
class ContentType
|
6
|
+
extend Base
|
7
|
+
|
8
|
+
module API
|
9
|
+
extend ActiveSupport::Concern
|
10
|
+
|
11
|
+
included do
|
12
|
+
class_attribute :allowed_content_types,
|
13
|
+
default: { index: {}, patterns: Set.new },
|
14
|
+
instance_accessor: false,
|
15
|
+
instance_predicate: false
|
16
|
+
end
|
17
|
+
|
18
|
+
class_methods do
|
19
|
+
def content_type(*content_types)
|
20
|
+
content_types.each do |content_type|
|
21
|
+
case content_type
|
22
|
+
when String then allowed_content_types[:index][content_type] = true
|
23
|
+
when Regexp then allowed_content_types[:patterns] << content_type
|
24
|
+
end
|
25
|
+
end
|
26
|
+
end
|
27
|
+
end
|
28
|
+
end
|
29
|
+
|
30
|
+
def call(task)
|
31
|
+
yield if block_given? && permitted?(task)
|
32
|
+
end
|
33
|
+
|
34
|
+
private
|
35
|
+
|
36
|
+
def permitted?(task)
|
37
|
+
job = task[:controller]
|
38
|
+
content_types = job.class.allowed_content_types
|
39
|
+
|
40
|
+
return true if allows_all?(content_types)
|
41
|
+
|
42
|
+
content_type = task[:page].mime_type&.to_s || task[:page].headers["content-type"] || (return false)
|
43
|
+
|
44
|
+
content_types[:index].key?(content_type) || content_types[:patterns].any? do |pattern|
|
45
|
+
pattern.match?(content_type)
|
46
|
+
end
|
47
|
+
end
|
48
|
+
|
49
|
+
def allows_all?(content_types)
|
50
|
+
content_types[:index].empty? && content_types[:patterns].empty?
|
51
|
+
end
|
52
|
+
end
|
53
|
+
end
|
54
|
+
end
|
@@ -3,16 +3,20 @@
|
|
3
3
|
module Wayfarer
|
4
4
|
module Middleware
|
5
5
|
module Controller
|
6
|
-
|
7
|
-
base.cattr_accessor :chain, default: Chain.empty
|
8
|
-
base.attr_accessor :task
|
6
|
+
extend ActiveSupport::Concern
|
9
7
|
|
10
|
-
|
11
|
-
|
12
|
-
|
8
|
+
included do
|
9
|
+
class_attribute :chain,
|
10
|
+
default: Chain.empty,
|
11
|
+
instance_accessor: false,
|
12
|
+
instance_predicate: false
|
13
|
+
|
14
|
+
attr_accessor :task
|
15
|
+
|
16
|
+
include Wayfarer::Callbacks
|
13
17
|
end
|
14
18
|
|
15
|
-
|
19
|
+
class_methods do
|
16
20
|
def use(middleware)
|
17
21
|
chain.push(middleware.lazy)
|
18
22
|
api(middleware)
|
@@ -23,17 +27,17 @@ module Wayfarer
|
|
23
27
|
end
|
24
28
|
end
|
25
29
|
|
26
|
-
|
27
|
-
|
28
|
-
self.task = task
|
30
|
+
def call(task)
|
31
|
+
self.task = task
|
29
32
|
|
30
|
-
|
31
|
-
|
33
|
+
task[:job] ||= self
|
34
|
+
task[:controller] = self
|
32
35
|
|
33
|
-
|
34
|
-
|
35
|
-
end
|
36
|
+
self.class.chain.call(task) do
|
37
|
+
yield if block_given?
|
36
38
|
end
|
39
|
+
|
40
|
+
task[:return_value]
|
37
41
|
end
|
38
42
|
end
|
39
43
|
end
|
@@ -5,25 +5,28 @@ module Wayfarer
|
|
5
5
|
class Dedup
|
6
6
|
extend Base
|
7
7
|
|
8
|
+
include Wayfarer::Logging.emit(
|
9
|
+
deduplicated: [:info, "Deduplicated URL"],
|
10
|
+
retry: [:debug, "Not deduplicating retry"],
|
11
|
+
rerouted: [:debug, "Not deduplicating rerouted task"]
|
12
|
+
)
|
13
|
+
|
8
14
|
def call(task)
|
9
|
-
|
10
|
-
return yield if task.metadata.action
|
15
|
+
task[:barrier] ||= Wayfarer::Redis::Barrier.new(task)
|
11
16
|
|
12
|
-
|
17
|
+
if task[:job].executions > 1
|
18
|
+
log(:retry, task)
|
19
|
+
return yield if block_given?
|
20
|
+
end
|
13
21
|
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
task.barrier.unsee(task.url)
|
18
|
-
raise e
|
22
|
+
if task[:job] != task[:controller]
|
23
|
+
log(:rerouted, task)
|
24
|
+
return yield if block_given?
|
19
25
|
end
|
20
26
|
|
21
|
-
|
22
|
-
return if staged_urls.none?
|
27
|
+
return log(:deduplicated, task) if task[:barrier].check!(task[:normalized_url])
|
23
28
|
|
24
|
-
|
25
|
-
unseen = staged_urls.zip(inclusion).reject { |_, seen| seen }.map(&:first)
|
26
|
-
task.metadata.staged_urls = SortedSet.new(unseen)
|
29
|
+
yield if block_given?
|
27
30
|
end
|
28
31
|
end
|
29
32
|
end
|