wayfarer 0.4.6 → 0.4.7
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.github/workflows/lint.yaml +25 -0
- data/.github/workflows/release.yaml +29 -0
- data/.github/workflows/tests.yaml +30 -0
- data/.gitignore +4 -0
- data/.rubocop.yml +5 -0
- data/.vale.ini +5 -0
- data/.yardopts +1 -3
- data/Dockerfile +5 -4
- data/Gemfile +3 -0
- data/Gemfile.lock +107 -102
- data/Rakefile +5 -56
- data/bin/wayfarer +1 -1
- data/docker-compose.yml +20 -9
- data/docs/cookbook/consent_screen.md +2 -2
- data/docs/cookbook/executing_javascript.md +3 -3
- data/docs/cookbook/navigation.md +12 -12
- data/docs/cookbook/querying_html.md +3 -3
- data/docs/cookbook/screenshots.md +2 -2
- data/docs/cookbook/user_agent.md +1 -1
- data/docs/design.md +36 -0
- data/docs/guides/callbacks.md +24 -126
- data/docs/guides/configuration.md +8 -8
- data/docs/guides/handlers.md +60 -0
- data/docs/guides/index.md +1 -0
- data/docs/guides/jobs/error_handling.md +40 -0
- data/docs/guides/jobs.md +99 -31
- data/docs/guides/navigation.md +1 -1
- data/docs/guides/networking/capybara.md +13 -22
- data/docs/guides/networking/custom_adapters.md +82 -41
- data/docs/guides/networking/ferrum.md +4 -4
- data/docs/guides/networking/http.md +9 -13
- data/docs/guides/networking/selenium.md +10 -11
- data/docs/guides/pages.md +76 -10
- data/docs/guides/redis.md +10 -0
- data/docs/guides/routing.md +74 -0
- data/docs/guides/tasks.md +33 -9
- data/docs/guides/tutorial.md +60 -0
- data/docs/guides/user_agents.md +113 -0
- data/docs/index.md +17 -40
- data/docs/reference/cli.md +35 -25
- data/docs/reference/configuration.md +36 -0
- data/lib/wayfarer/base.rb +124 -46
- data/lib/wayfarer/batch_completion.rb +56 -0
- data/lib/wayfarer/callbacks.rb +22 -48
- data/lib/wayfarer/cli/route_printer.rb +71 -57
- data/lib/wayfarer/cli.rb +121 -0
- data/lib/wayfarer/gc.rb +13 -6
- data/lib/wayfarer/handler.rb +15 -7
- data/lib/wayfarer/logging.rb +38 -0
- data/lib/wayfarer/middleware/base.rb +2 -0
- data/lib/wayfarer/middleware/batch_completion.rb +19 -0
- data/lib/wayfarer/middleware/content_type.rb +54 -0
- data/lib/wayfarer/middleware/controller.rb +19 -15
- data/lib/wayfarer/middleware/dedup.rb +16 -13
- data/lib/wayfarer/middleware/dispatch.rb +12 -4
- data/lib/wayfarer/middleware/normalize.rb +12 -11
- data/lib/wayfarer/middleware/redis.rb +15 -0
- data/lib/wayfarer/middleware/router.rb +33 -35
- data/lib/wayfarer/middleware/stage.rb +5 -5
- data/lib/wayfarer/middleware/uri_parser.rb +30 -0
- data/lib/wayfarer/middleware/user_agent.rb +49 -0
- data/lib/wayfarer/networking/capybara.rb +1 -1
- data/lib/wayfarer/networking/context.rb +2 -2
- data/lib/wayfarer/networking/ferrum.rb +2 -2
- data/lib/wayfarer/networking/follow.rb +12 -6
- data/lib/wayfarer/networking/http.rb +1 -1
- data/lib/wayfarer/networking/pool.rb +17 -12
- data/lib/wayfarer/networking/selenium.rb +3 -3
- data/lib/wayfarer/networking/strategy.rb +2 -2
- data/lib/wayfarer/page.rb +36 -14
- data/lib/wayfarer/parsing/xml.rb +6 -6
- data/lib/wayfarer/parsing.rb +24 -0
- data/lib/wayfarer/redis/barrier.rb +13 -21
- data/lib/wayfarer/redis/counter.rb +19 -9
- data/lib/wayfarer/redis/pool.rb +1 -1
- data/lib/wayfarer/redis/resettable.rb +19 -0
- data/lib/wayfarer/routing/dsl.rb +1 -0
- data/lib/wayfarer/routing/matchers/path.rb +4 -2
- data/lib/wayfarer/routing/root_route.rb +5 -1
- data/lib/wayfarer/routing/route.rb +4 -14
- data/lib/wayfarer/stringify.rb +22 -30
- data/lib/wayfarer/task.rb +12 -18
- data/lib/wayfarer.rb +28 -1
- data/mkdocs.yml +52 -7
- data/rake/docs.rake +26 -0
- data/rake/lint.rake +105 -0
- data/rake/release.rake +29 -0
- data/rake/tests.rake +28 -0
- data/requirements.txt +1 -1
- data/spec/base_spec.rb +140 -160
- data/spec/batch_completion_spec.rb +104 -0
- data/spec/cli/job_spec.rb +19 -23
- data/spec/cli/routing_spec.rb +101 -0
- data/spec/cli/version_spec.rb +1 -1
- data/spec/factories/task.rb +7 -1
- data/spec/fixtures/dummy_job.rb +5 -3
- data/spec/gc_spec.rb +8 -50
- data/spec/handler_spec.rb +1 -1
- data/spec/integration/callbacks_spec.rb +157 -45
- data/spec/integration/content_type_spec.rb +145 -0
- data/spec/integration/gc_spec.rb +44 -0
- data/spec/integration/handler_spec.rb +66 -0
- data/spec/integration/page_spec.rb +44 -29
- data/spec/integration/params_spec.rb +33 -25
- data/spec/integration/parsing_spec.rb +125 -0
- data/spec/integration/routing_spec.rb +18 -0
- data/spec/integration/stage_spec.rb +27 -20
- data/spec/middleware/batch_completion_spec.rb +34 -0
- data/spec/middleware/chain_spec.rb +8 -8
- data/spec/middleware/content_type_spec.rb +86 -0
- data/spec/middleware/controller_spec.rb +5 -5
- data/spec/middleware/dedup_spec.rb +38 -55
- data/spec/middleware/dispatch_spec.rb +23 -7
- data/spec/middleware/normalize_spec.rb +44 -13
- data/spec/middleware/router_spec.rb +29 -30
- data/spec/middleware/stage_spec.rb +8 -8
- data/spec/middleware/uri_parser_spec.rb +53 -0
- data/spec/middleware/{fetch_spec.rb → user_agent_spec.rb} +28 -27
- data/spec/networking/context_spec.rb +1 -1
- data/spec/networking/follow_spec.rb +2 -2
- data/spec/networking/pool_spec.rb +5 -5
- data/spec/networking/strategy.rb +2 -2
- data/spec/page_spec.rb +42 -20
- data/spec/parsing/xml_spec.rb +11 -12
- data/spec/redis/barrier_spec.rb +8 -48
- data/spec/redis/counter_spec.rb +13 -1
- data/spec/redis/pool_spec.rb +1 -1
- data/spec/spec_helpers.rb +27 -16
- data/spec/support/test_app.rb +8 -0
- data/spec/task_spec.rb +3 -24
- data/spec/wayfarer_spec.rb +1 -1
- data/wayfarer.gemspec +4 -3
- metadata +61 -51
- data/.github/workflows/ci.yaml +0 -32
- data/docs/guides/error_handling.md +0 -53
- data/docs/guides/networking.md +0 -94
- data/docs/guides/performance.md +0 -130
- data/docs/guides/reliability.md +0 -41
- data/docs/guides/routing/steering.md +0 -30
- data/docs/reference/api/base.md +0 -48
- data/docs/reference/configuration_keys.md +0 -43
- data/docs/reference/environment_variables.md +0 -83
- data/lib/wayfarer/cli/base.rb +0 -45
- data/lib/wayfarer/cli/generate.rb +0 -17
- data/lib/wayfarer/cli/job.rb +0 -56
- data/lib/wayfarer/cli/route.rb +0 -29
- data/lib/wayfarer/cli/runner.rb +0 -34
- data/lib/wayfarer/cli/templates/Gemfile.tt +0 -5
- data/lib/wayfarer/cli/templates/job.rb.tt +0 -10
- data/lib/wayfarer/config/capybara.rb +0 -10
- data/lib/wayfarer/config/ferrum.rb +0 -11
- data/lib/wayfarer/config/networking.rb +0 -29
- data/lib/wayfarer/config/redis.rb +0 -14
- data/lib/wayfarer/config/root.rb +0 -11
- data/lib/wayfarer/config/selenium.rb +0 -21
- data/lib/wayfarer/config/strconv.rb +0 -45
- data/lib/wayfarer/config/struct.rb +0 -72
- data/lib/wayfarer/middleware/fetch.rb +0 -56
- data/lib/wayfarer/redis/connection.rb +0 -13
- data/lib/wayfarer/redis/version.rb +0 -19
- data/lib/wayfarer/routing/router.rb +0 -28
- data/spec/callbacks_spec.rb +0 -102
- data/spec/cli/generate_spec.rb +0 -39
- data/spec/config/capybara_spec.rb +0 -18
- data/spec/config/ferrum_spec.rb +0 -24
- data/spec/config/networking_spec.rb +0 -73
- data/spec/config/redis_spec.rb +0 -32
- data/spec/config/root_spec.rb +0 -31
- data/spec/config/selenium_spec.rb +0 -56
- data/spec/config/strconv_spec.rb +0 -58
- data/spec/config/struct_spec.rb +0 -66
- data/spec/integration/steering_spec.rb +0 -57
- data/spec/redis/version_spec.rb +0 -13
- data/spec/routing/router_spec.rb +0 -24
data/lib/wayfarer/callbacks.rb
CHANGED
@@ -2,69 +2,43 @@
|
|
2
2
|
|
3
3
|
module Wayfarer
|
4
4
|
module Callbacks
|
5
|
-
|
6
|
-
OPTIONS = { terminator: TERMINATOR, skip_after_callbacks_if_terminated: true }.freeze
|
5
|
+
extend ActiveSupport::Concern
|
7
6
|
|
8
|
-
|
9
|
-
|
10
|
-
return if only && !applies?(only)
|
11
|
-
return if except && applies?(except)
|
7
|
+
included do
|
8
|
+
include ActiveSupport::Callbacks
|
12
9
|
|
13
|
-
|
10
|
+
define_callbacks :fetch, skip_after_callbacks_if_terminated: true
|
11
|
+
define_callbacks :action, skip_after_callbacks_if_terminated: true
|
12
|
+
define_callbacks :batch
|
13
|
+
end
|
14
14
|
|
15
|
-
|
15
|
+
class_methods do
|
16
|
+
def before_fetch(*filters, &block)
|
17
|
+
set_callback(:fetch, :before, *filters, &block)
|
16
18
|
end
|
17
19
|
|
18
|
-
|
19
|
-
|
20
|
-
def applies?(condition)
|
21
|
-
case condition
|
22
|
-
when Symbol then condition == action
|
23
|
-
when Enumerable then condition&.include?(action)
|
24
|
-
end
|
20
|
+
def around_fetch(*filters, &block)
|
21
|
+
set_callback(:fetch, :around, *filters, &block)
|
25
22
|
end
|
26
23
|
|
27
|
-
def
|
28
|
-
filters
|
24
|
+
def after_fetch(*filters, &block)
|
25
|
+
set_callback(:fetch, :after, *filters, &block)
|
29
26
|
end
|
30
27
|
|
31
|
-
def
|
32
|
-
filters
|
28
|
+
def before_action(*filters, &block)
|
29
|
+
set_callback(:action, :before, *filters, &block)
|
33
30
|
end
|
34
31
|
|
35
|
-
def
|
36
|
-
|
32
|
+
def around_action(*filters, &block)
|
33
|
+
set_callback(:action, :around, *filters, &block)
|
37
34
|
end
|
38
35
|
|
39
|
-
def
|
40
|
-
|
36
|
+
def after_action(*filters, &block)
|
37
|
+
set_callback(:action, :after, *filters, &block)
|
41
38
|
end
|
42
|
-
end
|
43
|
-
|
44
|
-
def self.included(base)
|
45
|
-
base.include(ActiveSupport::Callbacks)
|
46
|
-
base.extend(ClassMethods)
|
47
|
-
|
48
|
-
base.class_eval do
|
49
|
-
define_callbacks(:fetch, OPTIONS)
|
50
|
-
define_callbacks(:action, OPTIONS)
|
51
|
-
define_callbacks(:batch, OPTIONS)
|
52
|
-
|
53
|
-
define(:fetch, :before)
|
54
|
-
define(:action, :before)
|
55
|
-
define(:batch, :after)
|
56
|
-
end
|
57
|
-
end
|
58
|
-
|
59
|
-
module ClassMethods
|
60
|
-
private
|
61
39
|
|
62
|
-
def
|
63
|
-
|
64
|
-
set_callback(name, stage, **filters) do |job|
|
65
|
-
ConditionalCallback.new(job, filters).run(method, &block)
|
66
|
-
end
|
67
|
-
end
|
40
|
+
def after_batch(*filters, &block)
|
41
|
+
set_callback(:batch, :after, *filters, &block)
|
68
42
|
end
|
69
43
|
end
|
70
44
|
end
|
@@ -1,16 +1,14 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
3
|
module Wayfarer
|
4
|
-
|
4
|
+
class CLI
|
5
5
|
class RoutePrinter < Thor::Shell::Color
|
6
|
-
attr_reader :url,
|
7
|
-
:path_finder,
|
8
|
-
:output
|
6
|
+
attr_reader :url, :path_finder, :output
|
9
7
|
|
10
|
-
INDENT
|
11
|
-
REGULAR_SEGMENT
|
8
|
+
INDENT = " "
|
9
|
+
REGULAR_SEGMENT = "│ "
|
12
10
|
JUNCTION_SEGMENT = "├──"
|
13
|
-
CORNER_SEGMENT
|
11
|
+
CORNER_SEGMENT = "└──"
|
14
12
|
|
15
13
|
def self.print(route, url)
|
16
14
|
route.accept(new(url))
|
@@ -24,81 +22,83 @@ module Wayfarer
|
|
24
22
|
|
25
23
|
def visit(route)
|
26
24
|
route.accept(path_finder) unless route.parent
|
27
|
-
|
28
|
-
|
29
|
-
puts [segments(route), label(route)].join("")[3..]
|
25
|
+
puts format_route_output(route)
|
30
26
|
true
|
31
27
|
end
|
32
28
|
|
29
|
+
private
|
30
|
+
|
31
|
+
def format_route_output(route)
|
32
|
+
[segments(route), route_description(route)].join[3..]
|
33
|
+
end
|
34
|
+
|
33
35
|
def segments(route)
|
34
|
-
|
35
|
-
parents = parents(route).map { |parent| parent_segment(parent) }
|
36
|
-
[parents, current].join
|
36
|
+
[parents(route).map { |parent| parent_segment(parent) }, segment(route)].join
|
37
37
|
end
|
38
38
|
|
39
39
|
def parent_segment(parent)
|
40
|
-
|
41
|
-
INDENT
|
42
|
-
else
|
43
|
-
REGULAR_SEGMENT
|
44
|
-
end
|
40
|
+
trailer?(parent) ? INDENT : REGULAR_SEGMENT
|
45
41
|
end
|
46
42
|
|
47
43
|
def segment(route)
|
48
|
-
|
49
|
-
CORNER_SEGMENT
|
50
|
-
else
|
51
|
-
JUNCTION_SEGMENT
|
52
|
-
end
|
44
|
+
trailer?(route) ? CORNER_SEGMENT : JUNCTION_SEGMENT
|
53
45
|
end
|
54
46
|
|
55
|
-
def
|
56
|
-
[
|
57
|
-
|
58
|
-
|
47
|
+
def route_description(route)
|
48
|
+
attrs = [route_arg(route), routing_result(route), route_action(route), route_params(route)].compact
|
49
|
+
text = attrs.any? ? "#{matcher_name(route)}(#{attrs.join(', ')})" : matcher_name(route)
|
50
|
+
set_color(text, *route_colors(route))
|
59
51
|
end
|
60
52
|
|
61
|
-
def
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
53
|
+
def matcher_name(route)
|
54
|
+
case route
|
55
|
+
when Wayfarer::Routing::TargetRoute
|
56
|
+
"Target"
|
57
|
+
when Wayfarer::Routing::RootRoute
|
58
|
+
Wayfarer::Routing::PathFinder.result(route, url).class.name.demodulize
|
66
59
|
else
|
67
|
-
|
60
|
+
route.matcher.class.name.demodulize
|
68
61
|
end
|
69
62
|
end
|
70
63
|
|
71
|
-
def
|
72
|
-
return
|
64
|
+
def routing_result(route)
|
65
|
+
return if route.is_a?(Wayfarer::Routing::RootRoute)
|
73
66
|
|
74
|
-
|
67
|
+
"match: #{route.matcher.match(url)}"
|
75
68
|
end
|
76
69
|
|
77
|
-
def
|
78
|
-
return
|
70
|
+
def route_action(route)
|
71
|
+
return unless route.is_a?(Wayfarer::Routing::RootRoute)
|
79
72
|
|
80
|
-
route
|
73
|
+
result = Wayfarer::Routing::PathFinder.result(route, url)
|
74
|
+
result.action.inspect if result.is_a?(Wayfarer::Routing::Result::Match)
|
81
75
|
end
|
82
76
|
|
83
|
-
def
|
84
|
-
return
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
77
|
+
def route_arg(route)
|
78
|
+
return if route.is_a?(Wayfarer::Routing::RootRoute) || route.is_a?(Wayfarer::Routing::TargetRoute)
|
79
|
+
|
80
|
+
matcher = route.matcher
|
81
|
+
matcher_opts = case matcher
|
82
|
+
when Wayfarer::Routing::Matchers::Host then matcher.host
|
83
|
+
when Wayfarer::Routing::Matchers::Path then matcher.path
|
84
|
+
when Wayfarer::Routing::Matchers::Query then matcher.fields
|
85
|
+
when Wayfarer::Routing::Matchers::Custom then route.action.to_s
|
86
|
+
when Wayfarer::Routing::Matchers::Scheme then matcher.scheme
|
87
|
+
when Wayfarer::Routing::Matchers::Suffix then matcher.suffix
|
88
|
+
end
|
89
|
+
matcher_opts.inspect
|
94
90
|
end
|
95
91
|
|
96
|
-
def
|
97
|
-
params = route.
|
98
|
-
|
99
|
-
|
92
|
+
def route_params(route)
|
93
|
+
params = if route.is_a?(Wayfarer::Routing::RootRoute)
|
94
|
+
result = Wayfarer::Routing::PathFinder.result(route, url)
|
95
|
+
result.params if result.is_a?(Wayfarer::Routing::Result::Match)
|
96
|
+
else
|
97
|
+
route.matcher.params(url)
|
98
|
+
end
|
100
99
|
|
101
|
-
|
100
|
+
"params: #{params.symbolize_keys}" if params&.any?
|
101
|
+
end
|
102
102
|
|
103
103
|
def parents(route, current = [])
|
104
104
|
return current unless route.parent
|
@@ -107,9 +107,23 @@ module Wayfarer
|
|
107
107
|
end
|
108
108
|
|
109
109
|
def trailer?(route)
|
110
|
-
|
110
|
+
!route.parent || route.parent.children.last == route
|
111
|
+
end
|
112
|
+
|
113
|
+
def route_colors(route)
|
114
|
+
if path_finder.path.include?(route)
|
115
|
+
%i[green bold]
|
116
|
+
elsif route.matcher.match(url)
|
117
|
+
%i[green]
|
118
|
+
else
|
119
|
+
%i[red]
|
120
|
+
end
|
121
|
+
end
|
122
|
+
|
123
|
+
def set_color(string, *colors)
|
124
|
+
return string if ENV.key?("NO_COLOR")
|
111
125
|
|
112
|
-
|
126
|
+
super(string, *colors)
|
113
127
|
end
|
114
128
|
end
|
115
129
|
end
|
data/lib/wayfarer/cli.rb
ADDED
@@ -0,0 +1,121 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Wayfarer
|
4
|
+
class CLI < Thor
|
5
|
+
def self.exit_on_failure?
|
6
|
+
true
|
7
|
+
end
|
8
|
+
|
9
|
+
desc "version", "Print version"
|
10
|
+
def version
|
11
|
+
say Wayfarer::VERSION::STRING
|
12
|
+
end
|
13
|
+
|
14
|
+
class_option :require, aliases: :r, type: :string, default: nil
|
15
|
+
|
16
|
+
desc "route JOB URL", "Routing result of URL for JOB"
|
17
|
+
def route(job, url)
|
18
|
+
load_environment
|
19
|
+
|
20
|
+
url = parsed_url(url)
|
21
|
+
job = job.classify.constantize
|
22
|
+
route = job.route
|
23
|
+
route.invoke(url)
|
24
|
+
|
25
|
+
result = Wayfarer::Routing::PathFinder.result(route, url)
|
26
|
+
result_type = result.class.name.demodulize
|
27
|
+
|
28
|
+
say case result
|
29
|
+
when Wayfarer::Routing::Result::Match
|
30
|
+
"#{result_type} => #{result.action.inspect}"
|
31
|
+
else
|
32
|
+
result_type
|
33
|
+
end
|
34
|
+
end
|
35
|
+
|
36
|
+
desc "tree JOB URL", "Visualize JOB's routing tree for URL"
|
37
|
+
def tree(job, url)
|
38
|
+
load_environment
|
39
|
+
|
40
|
+
url = parsed_url(url)
|
41
|
+
job = job.classify.constantize
|
42
|
+
route = job.route
|
43
|
+
route.invoke(url)
|
44
|
+
|
45
|
+
Wayfarer::CLI::RoutePrinter.print(route, url)
|
46
|
+
end
|
47
|
+
|
48
|
+
desc "perform JOB URL", "Perform JOB with URL"
|
49
|
+
option :mock_redis, type: :boolean
|
50
|
+
option :batch, type: :string, default: SecureRandom.uuid
|
51
|
+
def perform(job, url)
|
52
|
+
load_environment
|
53
|
+
mock_redis
|
54
|
+
|
55
|
+
job = job.classify.constantize
|
56
|
+
task = Wayfarer::Task.new(url, options[:batch])
|
57
|
+
job.new(task).perform_now
|
58
|
+
end
|
59
|
+
|
60
|
+
desc "enqueue JOB URL", "Enqueue JOB with URL"
|
61
|
+
option :batch, type: :string, default: SecureRandom.uuid
|
62
|
+
def enqueue(job, url)
|
63
|
+
load_environment
|
64
|
+
|
65
|
+
job = job.classify.constantize
|
66
|
+
job.crawl(url, batch: options[:batch])
|
67
|
+
end
|
68
|
+
|
69
|
+
desc "execute JOB URL", "Execute JOB with async adapter starting from URL"
|
70
|
+
option :mock_redis, type: :boolean
|
71
|
+
option :batch, type: :string, default: SecureRandom.uuid
|
72
|
+
option :min_threads, type: :numeric, default: 1
|
73
|
+
option :max_threads, type: :numeric, default: 1
|
74
|
+
def execute(job, url)
|
75
|
+
load_environment
|
76
|
+
mock_redis
|
77
|
+
|
78
|
+
job = job.classify.constantize
|
79
|
+
job.queue_adapter = ActiveJob::QueueAdapters::AsyncAdapter.new(min_threads: options[:min_threads],
|
80
|
+
max_threads: options[:max_threads])
|
81
|
+
scheduler = job.queue_adapter.instance_variable_get(:@scheduler)
|
82
|
+
executor = scheduler.instance_variable_get(:@async_executor)
|
83
|
+
|
84
|
+
job.crawl(url, batch: options[:batch])
|
85
|
+
|
86
|
+
sleep(0.1) while executor.scheduled_task_count > executor.completed_task_count
|
87
|
+
|
88
|
+
Wayfarer::Networking::Pool.instance.free
|
89
|
+
end
|
90
|
+
|
91
|
+
private
|
92
|
+
|
93
|
+
def mock_redis
|
94
|
+
Wayfarer.config[:redis][:factory] = ->(_) { MockRedis.new } if options[:mock_redis]
|
95
|
+
end
|
96
|
+
|
97
|
+
def parsed_url(url)
|
98
|
+
Addressable::URI.parse(url).normalize
|
99
|
+
end
|
100
|
+
|
101
|
+
def load_environment(require_path = options[:require])
|
102
|
+
require File.join(Dir.pwd, require_path) if require_path
|
103
|
+
|
104
|
+
load_rails
|
105
|
+
end
|
106
|
+
|
107
|
+
def load_rails
|
108
|
+
begin
|
109
|
+
require "rails/app_loader"
|
110
|
+
rescue LoadError
|
111
|
+
return
|
112
|
+
end
|
113
|
+
|
114
|
+
return unless Rails::AppLoader.find_executable
|
115
|
+
|
116
|
+
require File.expand_path("config/application", Dir.pwd)
|
117
|
+
require File.expand_path("config/boot", Dir.pwd)
|
118
|
+
require File.expand_path("config/environment", Dir.pwd)
|
119
|
+
end
|
120
|
+
end
|
121
|
+
end
|
data/lib/wayfarer/gc.rb
CHANGED
@@ -1,14 +1,21 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
3
|
module Wayfarer
|
4
|
-
|
5
|
-
|
6
|
-
|
4
|
+
# TODO: Add logging
|
5
|
+
module GC
|
6
|
+
RESETTABLES = [Wayfarer::Redis::Barrier, Wayfarer::Redis::Counter].freeze
|
7
7
|
|
8
|
-
|
8
|
+
class << self
|
9
|
+
include Wayfarer::Logging.emit(gc: [:info, "Garbage collecting %<resettable>s"])
|
10
|
+
end
|
11
|
+
|
12
|
+
module_function
|
9
13
|
|
10
|
-
|
11
|
-
|
14
|
+
def run(task)
|
15
|
+
RESETTABLES.each do |resettable|
|
16
|
+
log(:gc, task, resettable: resettable)
|
17
|
+
resettable.new(task).reset!
|
18
|
+
end
|
12
19
|
end
|
13
20
|
end
|
14
21
|
end
|
data/lib/wayfarer/handler.rb
CHANGED
@@ -1,15 +1,23 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
3
|
module Wayfarer
|
4
|
-
|
5
|
-
|
4
|
+
module Handler
|
5
|
+
extend ActiveSupport::Concern
|
6
6
|
|
7
|
-
|
8
|
-
|
7
|
+
included do
|
8
|
+
include Wayfarer::Middleware::Controller
|
9
9
|
|
10
|
-
|
11
|
-
|
10
|
+
use Wayfarer::Middleware::ContentType
|
11
|
+
use Wayfarer::Middleware::Router
|
12
|
+
use Wayfarer::Middleware::Dispatch
|
12
13
|
|
13
|
-
|
14
|
+
api Wayfarer::Middleware::UserAgent
|
15
|
+
api Wayfarer::Middleware::Stage
|
16
|
+
|
17
|
+
singleton_class.undef_method :before_fetch
|
18
|
+
singleton_class.undef_method :around_fetch
|
19
|
+
singleton_class.undef_method :after_fetch
|
20
|
+
singleton_class.undef_method :after_batch
|
21
|
+
end
|
14
22
|
end
|
15
23
|
end
|
@@ -0,0 +1,38 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Wayfarer
|
4
|
+
module Logging
|
5
|
+
mattr_accessor :logger, default: ActiveSupport::Logger.new($stdout)
|
6
|
+
|
7
|
+
def self.emit(...)
|
8
|
+
Emitter.new(...)
|
9
|
+
end
|
10
|
+
|
11
|
+
class Emitter < Module
|
12
|
+
def initialize(messages)
|
13
|
+
@messages = messages
|
14
|
+
|
15
|
+
super()
|
16
|
+
end
|
17
|
+
|
18
|
+
def included(base)
|
19
|
+
messages = @messages
|
20
|
+
|
21
|
+
base.class_eval do
|
22
|
+
define_method(:log) do |key, task, **args|
|
23
|
+
level, msg = messages[key] || raise(ArgumentError, "No log message for #{key.inspect}")
|
24
|
+
severity = ActiveSupport::Logger::Severity.const_get(level.upcase)
|
25
|
+
|
26
|
+
ActiveSupport::TaggedLogging
|
27
|
+
.new(Logging.logger)
|
28
|
+
.tagged(task.batch, task.url, task[:controller]&.class&.name) do |logger|
|
29
|
+
logger.add(severity, msg % args)
|
30
|
+
end
|
31
|
+
end
|
32
|
+
end
|
33
|
+
end
|
34
|
+
end
|
35
|
+
|
36
|
+
private_constant :Emitter
|
37
|
+
end
|
38
|
+
end
|
@@ -0,0 +1,19 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Wayfarer
|
4
|
+
module Middleware
|
5
|
+
class BatchCompletion
|
6
|
+
extend Base
|
7
|
+
|
8
|
+
def call(task)
|
9
|
+
# Comparing to the initial state of `exception_executions` allows
|
10
|
+
# us to determine if an exception occurred when the job was performed,
|
11
|
+
# since the `perform.active_job` event is emitted for both successful
|
12
|
+
# and raising jobs.
|
13
|
+
task[:initial_exception_executions] ||= task[:job].exception_executions.clone
|
14
|
+
|
15
|
+
yield if block_given?
|
16
|
+
end
|
17
|
+
end
|
18
|
+
end
|
19
|
+
end
|
@@ -0,0 +1,54 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Wayfarer
|
4
|
+
module Middleware
|
5
|
+
class ContentType
|
6
|
+
extend Base
|
7
|
+
|
8
|
+
module API
|
9
|
+
extend ActiveSupport::Concern
|
10
|
+
|
11
|
+
included do
|
12
|
+
class_attribute :allowed_content_types,
|
13
|
+
default: { index: {}, patterns: Set.new },
|
14
|
+
instance_accessor: false,
|
15
|
+
instance_predicate: false
|
16
|
+
end
|
17
|
+
|
18
|
+
class_methods do
|
19
|
+
def content_type(*content_types)
|
20
|
+
content_types.each do |content_type|
|
21
|
+
case content_type
|
22
|
+
when String then allowed_content_types[:index][content_type] = true
|
23
|
+
when Regexp then allowed_content_types[:patterns] << content_type
|
24
|
+
end
|
25
|
+
end
|
26
|
+
end
|
27
|
+
end
|
28
|
+
end
|
29
|
+
|
30
|
+
def call(task)
|
31
|
+
yield if block_given? && permitted?(task)
|
32
|
+
end
|
33
|
+
|
34
|
+
private
|
35
|
+
|
36
|
+
def permitted?(task)
|
37
|
+
job = task[:controller]
|
38
|
+
content_types = job.class.allowed_content_types
|
39
|
+
|
40
|
+
return true if allows_all?(content_types)
|
41
|
+
|
42
|
+
content_type = task[:page].mime_type&.to_s || task[:page].headers["content-type"] || (return false)
|
43
|
+
|
44
|
+
content_types[:index].key?(content_type) || content_types[:patterns].any? do |pattern|
|
45
|
+
pattern.match?(content_type)
|
46
|
+
end
|
47
|
+
end
|
48
|
+
|
49
|
+
def allows_all?(content_types)
|
50
|
+
content_types[:index].empty? && content_types[:patterns].empty?
|
51
|
+
end
|
52
|
+
end
|
53
|
+
end
|
54
|
+
end
|
@@ -3,16 +3,20 @@
|
|
3
3
|
module Wayfarer
|
4
4
|
module Middleware
|
5
5
|
module Controller
|
6
|
-
|
7
|
-
base.cattr_accessor :chain, default: Chain.empty
|
8
|
-
base.attr_accessor :task
|
6
|
+
extend ActiveSupport::Concern
|
9
7
|
|
10
|
-
|
11
|
-
|
12
|
-
|
8
|
+
included do
|
9
|
+
class_attribute :chain,
|
10
|
+
default: Chain.empty,
|
11
|
+
instance_accessor: false,
|
12
|
+
instance_predicate: false
|
13
|
+
|
14
|
+
attr_accessor :task
|
15
|
+
|
16
|
+
include Wayfarer::Callbacks
|
13
17
|
end
|
14
18
|
|
15
|
-
|
19
|
+
class_methods do
|
16
20
|
def use(middleware)
|
17
21
|
chain.push(middleware.lazy)
|
18
22
|
api(middleware)
|
@@ -23,17 +27,17 @@ module Wayfarer
|
|
23
27
|
end
|
24
28
|
end
|
25
29
|
|
26
|
-
|
27
|
-
|
28
|
-
self.task = task
|
30
|
+
def call(task)
|
31
|
+
self.task = task
|
29
32
|
|
30
|
-
|
31
|
-
|
33
|
+
task[:job] ||= self
|
34
|
+
task[:controller] = self
|
32
35
|
|
33
|
-
|
34
|
-
|
35
|
-
end
|
36
|
+
self.class.chain.call(task) do
|
37
|
+
yield if block_given?
|
36
38
|
end
|
39
|
+
|
40
|
+
task[:return_value]
|
37
41
|
end
|
38
42
|
end
|
39
43
|
end
|
@@ -5,25 +5,28 @@ module Wayfarer
|
|
5
5
|
class Dedup
|
6
6
|
extend Base
|
7
7
|
|
8
|
+
include Wayfarer::Logging.emit(
|
9
|
+
deduplicated: [:info, "Deduplicated URL"],
|
10
|
+
retry: [:debug, "Not deduplicating retry"],
|
11
|
+
rerouted: [:debug, "Not deduplicating rerouted task"]
|
12
|
+
)
|
13
|
+
|
8
14
|
def call(task)
|
9
|
-
|
10
|
-
return yield if task.metadata.action
|
15
|
+
task[:barrier] ||= Wayfarer::Redis::Barrier.new(task)
|
11
16
|
|
12
|
-
|
17
|
+
if task[:job].executions > 1
|
18
|
+
log(:retry, task)
|
19
|
+
return yield if block_given?
|
20
|
+
end
|
13
21
|
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
task.barrier.unsee(task.url)
|
18
|
-
raise e
|
22
|
+
if task[:job] != task[:controller]
|
23
|
+
log(:rerouted, task)
|
24
|
+
return yield if block_given?
|
19
25
|
end
|
20
26
|
|
21
|
-
|
22
|
-
return if staged_urls.none?
|
27
|
+
return log(:deduplicated, task) if task[:barrier].check!(task[:normalized_url])
|
23
28
|
|
24
|
-
|
25
|
-
unseen = staged_urls.zip(inclusion).reject { |_, seen| seen }.map(&:first)
|
26
|
-
task.metadata.staged_urls = SortedSet.new(unseen)
|
29
|
+
yield if block_given?
|
27
30
|
end
|
28
31
|
end
|
29
32
|
end
|