wayfarer 0.4.6 → 0.4.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.env +17 -0
- data/.github/workflows/lint.yaml +27 -0
- data/.github/workflows/release.yaml +30 -0
- data/.github/workflows/tests.yaml +21 -0
- data/.gitignore +5 -1
- data/.rubocop.yml +36 -0
- data/.vale.ini +8 -0
- data/.yardopts +1 -3
- data/Dockerfile +6 -4
- data/Gemfile +24 -0
- data/Gemfile.lock +274 -164
- data/Rakefile +7 -51
- data/bin/wayfarer +1 -1
- data/docker-compose.yml +23 -13
- data/docs/cookbook/consent_screen.md +2 -2
- data/docs/cookbook/executing_javascript.md +3 -3
- data/docs/cookbook/navigation.md +12 -12
- data/docs/cookbook/querying_html.md +3 -3
- data/docs/cookbook/screenshots.md +2 -2
- data/docs/guides/callbacks.md +25 -125
- data/docs/guides/cli.md +71 -0
- data/docs/guides/configuration.md +10 -35
- data/docs/guides/development.md +67 -0
- data/docs/guides/handlers.md +60 -0
- data/docs/guides/index.md +1 -0
- data/docs/guides/jobs.md +142 -31
- data/docs/guides/navigation.md +1 -1
- data/docs/guides/networking/capybara.md +13 -22
- data/docs/guides/networking/custom_adapters.md +103 -41
- data/docs/guides/networking/ferrum.md +4 -4
- data/docs/guides/networking/http.md +9 -13
- data/docs/guides/networking/selenium.md +10 -11
- data/docs/guides/pages.md +78 -10
- data/docs/guides/redis.md +10 -0
- data/docs/guides/routing.md +156 -0
- data/docs/guides/tasks.md +53 -9
- data/docs/guides/tutorial.md +66 -0
- data/docs/guides/user_agents.md +115 -0
- data/docs/index.md +17 -40
- data/lib/wayfarer/base.rb +125 -46
- data/lib/wayfarer/batch_completion.rb +60 -0
- data/lib/wayfarer/callbacks.rb +22 -48
- data/lib/wayfarer/cli/route_printer.rb +85 -89
- data/lib/wayfarer/cli.rb +103 -0
- data/lib/wayfarer/gc.rb +18 -6
- data/lib/wayfarer/handler.rb +15 -7
- data/lib/wayfarer/kv.rb +28 -0
- data/lib/wayfarer/logging.rb +38 -0
- data/lib/wayfarer/middleware/base.rb +2 -0
- data/lib/wayfarer/middleware/batch_completion.rb +19 -0
- data/lib/wayfarer/middleware/chain.rb +7 -1
- data/lib/wayfarer/middleware/content_type.rb +59 -0
- data/lib/wayfarer/middleware/controller.rb +19 -15
- data/lib/wayfarer/middleware/dedup.rb +22 -13
- data/lib/wayfarer/middleware/dispatch.rb +17 -4
- data/lib/wayfarer/middleware/normalize.rb +7 -14
- data/lib/wayfarer/middleware/redis.rb +15 -0
- data/lib/wayfarer/middleware/router.rb +33 -35
- data/lib/wayfarer/middleware/stage.rb +5 -5
- data/lib/wayfarer/middleware/uri_parser.rb +31 -0
- data/lib/wayfarer/middleware/user_agent.rb +49 -0
- data/lib/wayfarer/networking/capybara.rb +1 -1
- data/lib/wayfarer/networking/context.rb +14 -3
- data/lib/wayfarer/networking/ferrum.rb +1 -4
- data/lib/wayfarer/networking/follow.rb +14 -7
- data/lib/wayfarer/networking/http.rb +1 -1
- data/lib/wayfarer/networking/pool.rb +23 -13
- data/lib/wayfarer/networking/selenium.rb +15 -7
- data/lib/wayfarer/networking/strategy.rb +2 -2
- data/lib/wayfarer/page.rb +34 -14
- data/lib/wayfarer/parsing/xml.rb +6 -6
- data/lib/wayfarer/parsing.rb +21 -0
- data/lib/wayfarer/redis/barrier.rb +26 -21
- data/lib/wayfarer/redis/counter.rb +18 -9
- data/lib/wayfarer/redis/pool.rb +1 -1
- data/lib/wayfarer/redis/resettable.rb +19 -0
- data/lib/wayfarer/routing/dsl.rb +166 -30
- data/lib/wayfarer/routing/hash_stack.rb +33 -0
- data/lib/wayfarer/routing/matchers/custom.rb +8 -5
- data/lib/wayfarer/routing/matchers/{suffix.rb → empty_params.rb} +2 -6
- data/lib/wayfarer/routing/matchers/host.rb +15 -9
- data/lib/wayfarer/routing/matchers/path.rb +11 -31
- data/lib/wayfarer/routing/matchers/query.rb +41 -17
- data/lib/wayfarer/routing/matchers/result.rb +12 -0
- data/lib/wayfarer/routing/matchers/scheme.rb +13 -5
- data/lib/wayfarer/routing/matchers/url.rb +13 -5
- data/lib/wayfarer/routing/path_consumer.rb +130 -0
- data/lib/wayfarer/routing/path_finder.rb +151 -23
- data/lib/wayfarer/routing/result.rb +1 -1
- data/lib/wayfarer/routing/root_route.rb +17 -1
- data/lib/wayfarer/routing/route.rb +66 -19
- data/lib/wayfarer/routing/serializable.rb +28 -0
- data/lib/wayfarer/routing/sub_route.rb +53 -0
- data/lib/wayfarer/routing/target_route.rb +17 -1
- data/lib/wayfarer/stringify.rb +21 -30
- data/lib/wayfarer/task.rb +9 -17
- data/lib/wayfarer/uri/normalization.rb +120 -0
- data/lib/wayfarer.rb +72 -5
- data/mise.toml +2 -0
- data/mkdocs.yml +44 -8
- data/rake/docs.rake +26 -0
- data/rake/lint.rake +9 -0
- data/rake/release.rake +23 -0
- data/rake/tests.rake +32 -0
- data/requirements.txt +1 -1
- data/spec/factories/job.rb +8 -0
- data/spec/factories/middleware.rb +2 -2
- data/spec/factories/path_finder.rb +11 -0
- data/spec/factories/redis.rb +19 -0
- data/spec/factories/task.rb +46 -2
- data/spec/spec_helpers.rb +55 -51
- data/spec/support/active_job_helpers.rb +8 -0
- data/spec/support/integration_helpers.rb +21 -0
- data/spec/support/redis_helpers.rb +9 -0
- data/spec/support/test_app.rb +66 -37
- data/spec/wayfarer/base_spec.rb +200 -0
- data/spec/wayfarer/batch_completion_spec.rb +142 -0
- data/spec/wayfarer/cli/job_spec.rb +88 -0
- data/spec/wayfarer/cli/routing_spec.rb +322 -0
- data/spec/{cli → wayfarer/cli}/version_spec.rb +1 -1
- data/spec/wayfarer/gc_spec.rb +29 -0
- data/spec/wayfarer/handler_spec.rb +9 -0
- data/spec/wayfarer/integration/callbacks_spec.rb +200 -0
- data/spec/wayfarer/integration/content_type_spec.rb +37 -0
- data/spec/wayfarer/integration/custom_routing_spec.rb +51 -0
- data/spec/wayfarer/integration/gc_spec.rb +40 -0
- data/spec/wayfarer/integration/handler_spec.rb +65 -0
- data/spec/wayfarer/integration/page_spec.rb +79 -0
- data/spec/wayfarer/integration/params_spec.rb +64 -0
- data/spec/wayfarer/integration/parsing_spec.rb +99 -0
- data/spec/wayfarer/integration/retry_spec.rb +112 -0
- data/spec/wayfarer/integration/stage_spec.rb +58 -0
- data/spec/wayfarer/middleware/batch_completion_spec.rb +33 -0
- data/spec/{middleware → wayfarer/middleware}/chain_spec.rb +24 -19
- data/spec/wayfarer/middleware/content_type_spec.rb +83 -0
- data/spec/{middleware → wayfarer/middleware}/controller_spec.rb +24 -22
- data/spec/wayfarer/middleware/dedup_spec.rb +66 -0
- data/spec/wayfarer/middleware/normalize_spec.rb +32 -0
- data/spec/wayfarer/middleware/router_spec.rb +102 -0
- data/spec/wayfarer/middleware/stage_spec.rb +63 -0
- data/spec/wayfarer/middleware/uri_parser_spec.rb +63 -0
- data/spec/wayfarer/middleware/user_agent_spec.rb +158 -0
- data/spec/wayfarer/networking/capybara_spec.rb +13 -0
- data/spec/{networking → wayfarer/networking}/context_spec.rb +46 -38
- data/spec/wayfarer/networking/ferrum_spec.rb +13 -0
- data/spec/{networking → wayfarer/networking}/follow_spec.rb +11 -6
- data/spec/wayfarer/networking/http_spec.rb +12 -0
- data/spec/{networking → wayfarer/networking}/pool_spec.rb +16 -14
- data/spec/wayfarer/networking/selenium_spec.rb +12 -0
- data/spec/{networking → wayfarer/networking}/strategy.rb +33 -54
- data/spec/wayfarer/page_spec.rb +69 -0
- data/spec/{parsing → wayfarer/parsing}/json_spec.rb +1 -1
- data/spec/wayfarer/parsing/xml_parse_spec.rb +25 -0
- data/spec/wayfarer/redis/barrier_spec.rb +39 -0
- data/spec/wayfarer/redis/counter_spec.rb +34 -0
- data/spec/{redis → wayfarer/redis}/pool_spec.rb +4 -3
- data/spec/{routing → wayfarer/routing}/dsl_spec.rb +12 -22
- data/spec/wayfarer/routing/hash_stack_spec.rb +63 -0
- data/spec/wayfarer/routing/integration_spec.rb +101 -0
- data/spec/wayfarer/routing/matchers/custom_spec.rb +39 -0
- data/spec/wayfarer/routing/matchers/host_spec.rb +56 -0
- data/spec/wayfarer/routing/matchers/matcher.rb +17 -0
- data/spec/wayfarer/routing/matchers/path_spec.rb +43 -0
- data/spec/wayfarer/routing/matchers/query_spec.rb +123 -0
- data/spec/wayfarer/routing/matchers/scheme_spec.rb +45 -0
- data/spec/wayfarer/routing/matchers/url_spec.rb +33 -0
- data/spec/wayfarer/routing/path_consumer_spec.rb +123 -0
- data/spec/wayfarer/routing/path_finder_spec.rb +409 -0
- data/spec/wayfarer/routing/root_route_spec.rb +51 -0
- data/spec/wayfarer/routing/route_spec.rb +74 -0
- data/spec/wayfarer/routing/sub_route_spec.rb +103 -0
- data/spec/wayfarer/task_spec.rb +13 -0
- data/spec/wayfarer/uri/normalization_spec.rb +98 -0
- data/spec/wayfarer_spec.rb +2 -2
- data/wayfarer.gemspec +18 -28
- metadata +797 -265
- data/.github/workflows/ci.yaml +0 -32
- data/.rbenv-gemsets +0 -1
- data/.ruby-version +0 -1
- data/RELEASING.md +0 -17
- data/docs/cookbook/user_agent.md +0 -7
- data/docs/guides/error_handling.md +0 -53
- data/docs/guides/networking.md +0 -94
- data/docs/guides/performance.md +0 -130
- data/docs/guides/reliability.md +0 -41
- data/docs/guides/routing/steering.md +0 -30
- data/docs/reference/api/base.md +0 -48
- data/docs/reference/cli.md +0 -61
- data/docs/reference/configuration_keys.md +0 -43
- data/docs/reference/environment_variables.md +0 -83
- data/lib/wayfarer/cli/base.rb +0 -45
- data/lib/wayfarer/cli/generate.rb +0 -17
- data/lib/wayfarer/cli/job.rb +0 -56
- data/lib/wayfarer/cli/route.rb +0 -29
- data/lib/wayfarer/cli/runner.rb +0 -34
- data/lib/wayfarer/cli/templates/Gemfile.tt +0 -5
- data/lib/wayfarer/cli/templates/job.rb.tt +0 -10
- data/lib/wayfarer/config/capybara.rb +0 -10
- data/lib/wayfarer/config/ferrum.rb +0 -11
- data/lib/wayfarer/config/networking.rb +0 -29
- data/lib/wayfarer/config/redis.rb +0 -14
- data/lib/wayfarer/config/root.rb +0 -11
- data/lib/wayfarer/config/selenium.rb +0 -21
- data/lib/wayfarer/config/strconv.rb +0 -45
- data/lib/wayfarer/config/struct.rb +0 -72
- data/lib/wayfarer/middleware/fetch.rb +0 -56
- data/lib/wayfarer/redis/connection.rb +0 -13
- data/lib/wayfarer/redis/version.rb +0 -19
- data/lib/wayfarer/routing/router.rb +0 -28
- data/spec/base_spec.rb +0 -224
- data/spec/callbacks_spec.rb +0 -102
- data/spec/cli/generate_spec.rb +0 -39
- data/spec/cli/job_spec.rb +0 -78
- data/spec/config/capybara_spec.rb +0 -18
- data/spec/config/ferrum_spec.rb +0 -24
- data/spec/config/networking_spec.rb +0 -73
- data/spec/config/redis_spec.rb +0 -32
- data/spec/config/root_spec.rb +0 -31
- data/spec/config/selenium_spec.rb +0 -56
- data/spec/config/strconv_spec.rb +0 -58
- data/spec/config/struct_spec.rb +0 -66
- data/spec/fixtures/dummy_job.rb +0 -7
- data/spec/gc_spec.rb +0 -59
- data/spec/handler_spec.rb +0 -11
- data/spec/integration/callbacks_spec.rb +0 -85
- data/spec/integration/page_spec.rb +0 -62
- data/spec/integration/params_spec.rb +0 -56
- data/spec/integration/stage_spec.rb +0 -51
- data/spec/integration/steering_spec.rb +0 -57
- data/spec/middleware/dedup_spec.rb +0 -88
- data/spec/middleware/dispatch_spec.rb +0 -43
- data/spec/middleware/fetch_spec.rb +0 -155
- data/spec/middleware/normalize_spec.rb +0 -29
- data/spec/middleware/router_spec.rb +0 -105
- data/spec/middleware/stage_spec.rb +0 -62
- data/spec/networking/capybara_spec.rb +0 -12
- data/spec/networking/ferrum_spec.rb +0 -12
- data/spec/networking/http_spec.rb +0 -12
- data/spec/networking/selenium_spec.rb +0 -12
- data/spec/page_spec.rb +0 -47
- data/spec/parsing/xml_spec.rb +0 -25
- data/spec/redis/barrier_spec.rb +0 -78
- data/spec/redis/counter_spec.rb +0 -32
- data/spec/redis/version_spec.rb +0 -13
- data/spec/routing/integration_spec.rb +0 -110
- data/spec/routing/matchers/custom_spec.rb +0 -31
- data/spec/routing/matchers/host_spec.rb +0 -49
- data/spec/routing/matchers/path_spec.rb +0 -43
- data/spec/routing/matchers/query_spec.rb +0 -137
- data/spec/routing/matchers/scheme_spec.rb +0 -25
- data/spec/routing/matchers/suffix_spec.rb +0 -41
- data/spec/routing/matchers/uri_spec.rb +0 -27
- data/spec/routing/path_finder_spec.rb +0 -33
- data/spec/routing/root_route_spec.rb +0 -29
- data/spec/routing/route_spec.rb +0 -43
- data/spec/routing/router_spec.rb +0 -24
- data/spec/task_spec.rb +0 -34
- data/spec/{stringify_spec.rb → wayfarer/stringify_spec.rb} +2 -2
data/lib/wayfarer/base.rb
CHANGED
@@ -1,60 +1,139 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
3
|
module Wayfarer
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
task = job.arguments.first
|
17
|
-
task.barrier.seen?(task.url)
|
18
|
-
task.gc.run
|
19
|
-
block.call(job, error)
|
20
|
-
end
|
21
|
-
end
|
4
|
+
# @!attribute [r] task
|
5
|
+
# @return [Wayfarer::Task] the current task
|
6
|
+
# @!attribute [r] uri
|
7
|
+
# @return [Addressable::URI] parsed task URL
|
8
|
+
# @!attribute [r] user_agent
|
9
|
+
# @return [Object] the user agent used to retrieve the page
|
10
|
+
# @!attribute [r] action
|
11
|
+
# @return [Symbol, Handler] action that the task URL was routed to
|
12
|
+
# @!attribute [r] params
|
13
|
+
# @return [HashWithIndifferentAccess] path parameters collected from routes
|
14
|
+
module Base
|
15
|
+
extend ActiveSupport::Concern
|
22
16
|
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
17
|
+
# @!method stage(urls)
|
18
|
+
# Adds URLs to an internal staging set so that they get enqueued
|
19
|
+
# eventually, once the job executed successfully.
|
20
|
+
# @overload stage(urls)
|
21
|
+
# @param urls [Array<String>] URLs to add to the staging set.
|
22
|
+
# @overload stage(url)
|
23
|
+
# @param url [String] URL to add to the staging set.
|
27
24
|
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
25
|
+
# @!method fetch(url, follow: 3)
|
26
|
+
# @param url [String] URL to fetch using plain HTTP(S).
|
27
|
+
# @param follow [Integer] Number of redirects to follow.
|
28
|
+
# Retrieves the given URL to a {Page}.
|
32
29
|
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
30
|
+
# @!method page(live: false)
|
31
|
+
# @param live [Boolean] whether to retrieve a new {Page}.
|
32
|
+
# @return [Wayfarer::Page]
|
33
|
+
# The most recently retrieved page or a new page for the current task URL if
|
34
|
+
# the `live` keyword is passed.
|
37
35
|
|
38
|
-
|
39
|
-
super(*argv, &ErrorHandler.call(&block))
|
40
|
-
end
|
36
|
+
# @!scope class
|
41
37
|
|
42
|
-
|
43
|
-
|
44
|
-
|
38
|
+
# @!attribute [r] route
|
39
|
+
# @return [Wayfarer::Routing::DSL]
|
40
|
+
# The job's {Wayfarer::Routing::DSL} that maps URLs to instance methods
|
41
|
+
# or to a {Handler}.
|
42
|
+
# @example Append a host route
|
43
|
+
# route.host "example.com", to: :index
|
45
44
|
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
45
|
+
# @!method content_types(*content_types)
|
46
|
+
# @param content_types [Array<String, Regexp>] Content-Types to whitelist
|
47
|
+
# Whitelists Content-Types. Once at least one Content-Type is set, only
|
48
|
+
# those Content-Types will be processed.
|
49
|
+
|
50
|
+
# @!group Callbacks
|
51
|
+
|
52
|
+
# @!method before_fetch
|
53
|
+
# @overload before_fetch(callback)
|
54
|
+
# @param callback [Symbol] Instance method to call
|
55
|
+
# @overload before_fetch(&block)
|
56
|
+
# @yield [Wayfarer::Task]
|
57
|
+
# Registers a callback that is called before the page is fetched.
|
58
|
+
# If a symbol is passed, an instance method with the same name will be
|
59
|
+
# called.
|
60
|
+
# @example Accessing the user agent in {#before_fetch}
|
61
|
+
# before_fetch do |task|
|
62
|
+
# user_agent # => the user agent that will fetch the page
|
63
|
+
# end
|
51
64
|
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
65
|
+
# @!method around_fetch
|
66
|
+
# @overload around_fetch(callback)
|
67
|
+
# @param callback [Symbol] Instance method to call
|
68
|
+
# @overload around_fetch(&block)
|
69
|
+
# @yield [Wayfarer::Task]
|
70
|
+
# Registers a callback that is called around the page getting fetched.
|
71
|
+
# If a symbol is passed, an instance method with the same name will be
|
72
|
+
# called.
|
73
|
+
|
74
|
+
# @!method after_fetch
|
75
|
+
# @overload after_fetch(callback)
|
76
|
+
# @param callback [Symbol] Instance method to call
|
77
|
+
# @overload after_fetch(&block)
|
78
|
+
# @yield [Wayfarer::Task]
|
79
|
+
# Registers a callback that is called after the page was fetched.
|
80
|
+
# If a symbol is passed, an instance method with the same name will be
|
81
|
+
# called.
|
82
|
+
|
83
|
+
# @!method before_perform
|
84
|
+
# @overload before_perform(callback)
|
85
|
+
# @param callback [Symbol] Instance method to call
|
86
|
+
# @overload before_perform(&block)
|
87
|
+
# @yield [Wayfarer::Task]
|
88
|
+
# Registers a callback that is called before the task is performed.
|
89
|
+
# If a symbol is passed, an instance method with the same name will be
|
90
|
+
# called.
|
91
|
+
|
92
|
+
# @!method around_perform
|
93
|
+
# @overload around_perform(callback)
|
94
|
+
# @param callback [Symbol] Instance method to call
|
95
|
+
# @overload around_perform(&block)
|
96
|
+
# @yield [Wayfarer::Task]
|
97
|
+
# Registers a callback that is called around the task getting performed.
|
98
|
+
# If a symbol is passed, an instance method with the same name will be
|
99
|
+
# called.
|
100
|
+
|
101
|
+
# @!method after_perform
|
102
|
+
# @overload after_perform(callback)
|
103
|
+
# @param callback [Symbol] Instance method to call
|
104
|
+
# @overload after_perform(&block)
|
105
|
+
# @yield [Wayfarer::Task]
|
106
|
+
# Registers a callback that is called after the task was performed.
|
107
|
+
# If a symbol is passed, an instance method with the same name will be
|
108
|
+
# called.
|
109
|
+
|
110
|
+
# @!endgroup
|
111
|
+
|
112
|
+
included do
|
113
|
+
include Wayfarer::Middleware::Controller
|
114
|
+
|
115
|
+
# Implement ActiveJob's #perform by calling into our own middleware
|
116
|
+
# chain included from {Controller}
|
117
|
+
alias_method :perform, :call
|
118
|
+
|
119
|
+
use Wayfarer::Middleware::Redis
|
120
|
+
use Wayfarer::Middleware::UriParser
|
121
|
+
use Wayfarer::Middleware::Normalize
|
122
|
+
use Wayfarer::Middleware::Dedup
|
123
|
+
use Wayfarer::Middleware::BatchCompletion
|
124
|
+
use Wayfarer::Middleware::Stage
|
125
|
+
use Wayfarer::Middleware::Router
|
126
|
+
use Wayfarer::Middleware::UserAgent
|
127
|
+
use Wayfarer::Middleware::ContentType
|
128
|
+
use Wayfarer::Middleware::Dispatch
|
56
129
|
end
|
57
130
|
|
58
|
-
|
131
|
+
class_methods do
|
132
|
+
def crawl(url, batch: SecureRandom.uuid)
|
133
|
+
Task.new(url, batch).tap do |task|
|
134
|
+
perform_later(task)
|
135
|
+
end
|
136
|
+
end
|
137
|
+
end
|
59
138
|
end
|
60
139
|
end
|
@@ -0,0 +1,60 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Wayfarer
|
4
|
+
# BatchCompletion tracks the completion of a batch of jobs.
|
5
|
+
# It does so by incrementing and decrementing a counter in Redis.
|
6
|
+
#
|
7
|
+
# The counter is incremented when a job is first enqueued and decremented when
|
8
|
+
# a job is performed. If a job is retried, the counter is not incremented.
|
9
|
+
# When a job succeeds, or fails and thereby exceeds its retry count, the counter
|
10
|
+
# is decremented.
|
11
|
+
#
|
12
|
+
# When the counter reaches zero, garbage collection deletes the Redis keys
|
13
|
+
# associated with the batch.
|
14
|
+
module BatchCompletion
|
15
|
+
module_function
|
16
|
+
|
17
|
+
EVENTS = %w[enqueue.active_job perform.active_job retry_stopped.active_job].freeze
|
18
|
+
|
19
|
+
def subscribe!
|
20
|
+
EVENTS.each { |event| ActiveSupport::Notifications.subscribe(event, self) }
|
21
|
+
end
|
22
|
+
|
23
|
+
def call(name, _, _, _, data)
|
24
|
+
return unless (job = data[:job]).is_a?(Wayfarer::Base)
|
25
|
+
|
26
|
+
task = job.arguments.first
|
27
|
+
|
28
|
+
# In the case of `enqueue.active_job` middleware hasn't executed yet
|
29
|
+
task[:redis_pool] ||= Wayfarer::Redis::Pool.instance
|
30
|
+
|
31
|
+
handle(name, job, task)
|
32
|
+
end
|
33
|
+
|
34
|
+
def handle(name, job, task)
|
35
|
+
counter = Wayfarer::Redis::Counter.new(task)
|
36
|
+
|
37
|
+
case name
|
38
|
+
when "enqueue.active_job" then counter.increment unless retry?(job)
|
39
|
+
when "perform.active_job" then succeed!(task, counter) if succeeded?(job, task)
|
40
|
+
when "retry_stopped.active_job" then fail!(counter)
|
41
|
+
end
|
42
|
+
end
|
43
|
+
|
44
|
+
def succeed!(task, counter)
|
45
|
+
Wayfarer::GC.run(task) if counter.decrement == 0
|
46
|
+
end
|
47
|
+
|
48
|
+
def fail!(counter)
|
49
|
+
counter.decrement
|
50
|
+
end
|
51
|
+
|
52
|
+
def retry?(job)
|
53
|
+
job.executions > 0
|
54
|
+
end
|
55
|
+
|
56
|
+
def succeeded?(job, task)
|
57
|
+
job.exception_executions == task[:initial_exception_executions]
|
58
|
+
end
|
59
|
+
end
|
60
|
+
end
|
data/lib/wayfarer/callbacks.rb
CHANGED
@@ -2,69 +2,43 @@
|
|
2
2
|
|
3
3
|
module Wayfarer
|
4
4
|
module Callbacks
|
5
|
-
|
6
|
-
OPTIONS = { terminator: TERMINATOR, skip_after_callbacks_if_terminated: true }.freeze
|
5
|
+
extend ActiveSupport::Concern
|
7
6
|
|
8
|
-
|
9
|
-
|
10
|
-
return if only && !applies?(only)
|
11
|
-
return if except && applies?(except)
|
7
|
+
included do
|
8
|
+
include ActiveSupport::Callbacks
|
12
9
|
|
13
|
-
|
10
|
+
define_callbacks :fetch, skip_after_callbacks_if_terminated: true
|
11
|
+
define_callbacks :action, skip_after_callbacks_if_terminated: true
|
12
|
+
define_callbacks :batch
|
13
|
+
end
|
14
14
|
|
15
|
-
|
15
|
+
class_methods do
|
16
|
+
def before_fetch(...)
|
17
|
+
set_callback(:fetch, :before, ...)
|
16
18
|
end
|
17
19
|
|
18
|
-
|
19
|
-
|
20
|
-
def applies?(condition)
|
21
|
-
case condition
|
22
|
-
when Symbol then condition == action
|
23
|
-
when Enumerable then condition&.include?(action)
|
24
|
-
end
|
20
|
+
def around_fetch(...)
|
21
|
+
set_callback(:fetch, :around, ...)
|
25
22
|
end
|
26
23
|
|
27
|
-
def
|
28
|
-
|
24
|
+
def after_fetch(...)
|
25
|
+
set_callback(:fetch, :after, ...)
|
29
26
|
end
|
30
27
|
|
31
|
-
def
|
32
|
-
|
28
|
+
def before_action(...)
|
29
|
+
set_callback(:action, :before, ...)
|
33
30
|
end
|
34
31
|
|
35
|
-
def
|
36
|
-
|
32
|
+
def around_action(...)
|
33
|
+
set_callback(:action, :around, ...)
|
37
34
|
end
|
38
35
|
|
39
|
-
def
|
40
|
-
|
36
|
+
def after_action(...)
|
37
|
+
set_callback(:action, :after, ...)
|
41
38
|
end
|
42
|
-
end
|
43
|
-
|
44
|
-
def self.included(base)
|
45
|
-
base.include(ActiveSupport::Callbacks)
|
46
|
-
base.extend(ClassMethods)
|
47
|
-
|
48
|
-
base.class_eval do
|
49
|
-
define_callbacks(:fetch, OPTIONS)
|
50
|
-
define_callbacks(:action, OPTIONS)
|
51
|
-
define_callbacks(:batch, OPTIONS)
|
52
|
-
|
53
|
-
define(:fetch, :before)
|
54
|
-
define(:action, :before)
|
55
|
-
define(:batch, :after)
|
56
|
-
end
|
57
|
-
end
|
58
|
-
|
59
|
-
module ClassMethods
|
60
|
-
private
|
61
39
|
|
62
|
-
def
|
63
|
-
|
64
|
-
set_callback(name, stage, **filters) do |job|
|
65
|
-
ConditionalCallback.new(job, filters).run(method, &block)
|
66
|
-
end
|
67
|
-
end
|
40
|
+
def after_batch(...)
|
41
|
+
set_callback(:batch, :after, ...)
|
68
42
|
end
|
69
43
|
end
|
70
44
|
end
|
@@ -1,115 +1,111 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
3
|
module Wayfarer
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
4
|
+
class CLI
|
5
|
+
# Turns a routing tree into a Hash and prints it.
|
6
|
+
# Used by the `route` CLI subcommand.
|
7
|
+
#
|
8
|
+
# @api private
|
9
|
+
class RoutePrinter
|
10
|
+
# @return [Hash<Symbol, Proc>]
|
11
|
+
class_attribute :serializers,
|
12
|
+
default: { yaml: ->(hash) { YAML.dump(hash.deep_stringify_keys) },
|
13
|
+
json: ->(hash) { JSON.pretty_generate(hash) },
|
14
|
+
ruby: ->(hash) { pp(hash) } },
|
15
|
+
instance_accessor: false,
|
16
|
+
instance_predicate: false
|
17
|
+
|
18
|
+
BATCH = "tmp"
|
19
|
+
|
20
|
+
# Prints a routing tree.
|
21
|
+
#
|
22
|
+
# @param route [Wayfarer::Routing::Route] route to print
|
23
|
+
# @param url [String] URL to match
|
24
|
+
# @param format [String, Symbol] `:json`, `:yaml` or `:ruby`
|
25
|
+
def self.print(route, url, format:)
|
26
|
+
new(route, url, serializers.fetch(format.to_sym)).print
|
17
27
|
end
|
18
28
|
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
29
|
+
# @param route [Wayfarer::Routing::Route] route to print
|
30
|
+
# @param url [String] URL to match
|
31
|
+
# @param serializer [Proc<Hash=>String>] output serializer
|
32
|
+
def initialize(route, url, serializer)
|
33
|
+
@route = route
|
34
|
+
@serializer = serializer
|
35
|
+
|
36
|
+
@nodes = {}
|
37
|
+
@root_hash = nil
|
38
|
+
|
39
|
+
task = Wayfarer::Task.new(url, BATCH)
|
40
|
+
task[:uri] = Addressable::URI.parse(url)
|
41
|
+
@path_finder = Wayfarer::Routing::PathFinder.new(
|
42
|
+
task,
|
43
|
+
stop_when_found: false,
|
44
|
+
&method(:call)
|
45
|
+
)
|
23
46
|
end
|
24
47
|
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
puts [segments(route), label(route)].join("")[3..]
|
30
|
-
true
|
31
|
-
end
|
32
|
-
|
33
|
-
def segments(route)
|
34
|
-
current = segment(route)
|
35
|
-
parents = parents(route).map { |parent| parent_segment(parent) }
|
36
|
-
[parents, current].join
|
37
|
-
end
|
38
|
-
|
39
|
-
def parent_segment(parent)
|
40
|
-
if trailer?(parent)
|
41
|
-
INDENT
|
42
|
-
else
|
43
|
-
REGULAR_SEGMENT
|
44
|
-
end
|
45
|
-
end
|
48
|
+
# Processes the routing trees and prints the serialized output.
|
49
|
+
def print
|
50
|
+
route.accept(path_finder)
|
46
51
|
|
47
|
-
|
48
|
-
if trailer?(route)
|
49
|
-
CORNER_SEGMENT
|
50
|
-
else
|
51
|
-
JUNCTION_SEGMENT
|
52
|
-
end
|
53
|
-
end
|
52
|
+
hash = routing_result(path_finder).merge(root_hash)
|
54
53
|
|
55
|
-
|
56
|
-
[highlight_matcher(route, matcher_label(route)),
|
57
|
-
highlight_options(route, options(route)),
|
58
|
-
highlight_options(route, params(route))].compact.join(" ")
|
54
|
+
puts serializer.call(hash)
|
59
55
|
end
|
60
56
|
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
57
|
+
# Callback method called by `path_finder` with the result of matching
|
58
|
+
# the route.
|
59
|
+
#
|
60
|
+
# @param route [Wayfarer::Routing::Route] the current route
|
61
|
+
# @param result [true, false] routing result
|
62
|
+
# @param path_finder [Wayfarer::Routing::PathFinder] the path finder
|
63
|
+
def call(route, result, path_finder)
|
64
|
+
node = (nodes[route] ||= attributes(route, result, path_finder))
|
65
|
+
parent = route.parent
|
70
66
|
|
71
|
-
|
72
|
-
return string unless path_finder.path.include?(route)
|
67
|
+
return @root_hash ||= node unless parent
|
73
68
|
|
74
|
-
|
69
|
+
nodes.dig(parent, route_type(parent), :children).append(node)
|
75
70
|
end
|
76
71
|
|
77
|
-
|
78
|
-
return "Target" if route.is_a?(Wayfarer::Routing::TargetRoute)
|
79
|
-
|
80
|
-
route.matcher.class.name.demodulize
|
81
|
-
end
|
72
|
+
private
|
82
73
|
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
74
|
+
attr_reader :route,
|
75
|
+
:path_finder,
|
76
|
+
:serializer,
|
77
|
+
:nodes,
|
78
|
+
:root_hash
|
79
|
+
|
80
|
+
def routing_result(path_finder)
|
81
|
+
return { routed: false } unless path_finder.found?
|
82
|
+
|
83
|
+
action = path_finder.action
|
84
|
+
{ routed: true,
|
85
|
+
params: path_finder.params,
|
86
|
+
action: case action
|
87
|
+
when Array
|
88
|
+
{ handler: action.first.class.name, action: action.second }
|
89
|
+
else action
|
90
|
+
end }
|
94
91
|
end
|
95
92
|
|
96
|
-
def
|
97
|
-
|
98
|
-
|
93
|
+
def attributes(route, result, path_finder)
|
94
|
+
{ route_type(route) => route.to_h.merge!(
|
95
|
+
route_result(route, result, path_finder),
|
96
|
+
children: []
|
97
|
+
) }
|
99
98
|
end
|
100
99
|
|
101
|
-
|
102
|
-
|
103
|
-
def parents(route, current = [])
|
104
|
-
return current unless route.parent
|
100
|
+
def route_result(route, result, path_finder)
|
101
|
+
return {} if route.target?
|
105
102
|
|
106
|
-
|
103
|
+
{ match: result,
|
104
|
+
params: path_finder.params_stack.to_h }
|
107
105
|
end
|
108
106
|
|
109
|
-
def
|
110
|
-
|
111
|
-
|
112
|
-
route.parent.children.last == route
|
107
|
+
def route_type(route)
|
108
|
+
route.class.name.demodulize.underscore
|
113
109
|
end
|
114
110
|
end
|
115
111
|
end
|
data/lib/wayfarer/cli.rb
ADDED
@@ -0,0 +1,103 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Wayfarer
|
4
|
+
class CLI < Thor
|
5
|
+
def self.exit_on_failure?
|
6
|
+
true
|
7
|
+
end
|
8
|
+
|
9
|
+
desc "version", "Print version"
|
10
|
+
def version
|
11
|
+
say Wayfarer::VERSION::STRING
|
12
|
+
end
|
13
|
+
|
14
|
+
class_option :require, aliases: :r, type: :string, default: nil
|
15
|
+
|
16
|
+
desc "route JOB URL", "Routing tree for URL for JOB"
|
17
|
+
option :format, type: :string, enum: %w[yaml json ruby], default: "yaml"
|
18
|
+
def route(job, url)
|
19
|
+
load_environment
|
20
|
+
|
21
|
+
url = parsed_url(url)
|
22
|
+
job = job.camelize.constantize
|
23
|
+
route = job.route
|
24
|
+
|
25
|
+
Wayfarer::CLI::RoutePrinter.print(route, url, format: options.fetch("format"))
|
26
|
+
end
|
27
|
+
|
28
|
+
desc "perform JOB URL", "Perform JOB with URL"
|
29
|
+
option :mock_redis, type: :boolean
|
30
|
+
option :batch, type: :string, default: SecureRandom.uuid
|
31
|
+
def perform(job, url)
|
32
|
+
load_environment
|
33
|
+
mock_redis
|
34
|
+
|
35
|
+
job = job.camelize.constantize
|
36
|
+
task = Wayfarer::Task.new(url, options.fetch(:batch))
|
37
|
+
job.new(task).perform_now
|
38
|
+
end
|
39
|
+
|
40
|
+
desc "enqueue JOB URL", "Enqueue JOB with URL"
|
41
|
+
option :batch, type: :string, default: SecureRandom.uuid
|
42
|
+
def enqueue(job, url)
|
43
|
+
load_environment
|
44
|
+
|
45
|
+
job = job.camelize.constantize
|
46
|
+
job.crawl(url, batch: options[:batch])
|
47
|
+
end
|
48
|
+
|
49
|
+
desc "execute JOB URL", "Execute JOB with async adapter starting from URL"
|
50
|
+
option :mock_redis, type: :boolean
|
51
|
+
option :batch, type: :string, default: SecureRandom.uuid
|
52
|
+
option :min_threads, type: :numeric, default: 1
|
53
|
+
option :max_threads, type: :numeric, default: 1
|
54
|
+
option :retain_pool, type: :boolean, default: false
|
55
|
+
def execute(job, url)
|
56
|
+
load_environment
|
57
|
+
mock_redis
|
58
|
+
|
59
|
+
job = job.camelize.constantize
|
60
|
+
job.queue_adapter = ActiveJob::QueueAdapters::AsyncAdapter.new(min_threads: options[:min_threads],
|
61
|
+
max_threads: options[:max_threads])
|
62
|
+
scheduler = job.queue_adapter.instance_variable_get(:@scheduler)
|
63
|
+
executor = scheduler.instance_variable_get(:@async_executor)
|
64
|
+
|
65
|
+
job.crawl(url, batch: options.fetch(:batch))
|
66
|
+
|
67
|
+
sleep(0.1) while executor.scheduled_task_count > executor.completed_task_count
|
68
|
+
|
69
|
+
# Used in test suite to avoid pool recreation
|
70
|
+
Wayfarer::Networking::Pool.instance.free unless options.fetch(:retain_pool)
|
71
|
+
end
|
72
|
+
|
73
|
+
private
|
74
|
+
|
75
|
+
def mock_redis
|
76
|
+
Wayfarer.config[:redis][:factory] = ->(_) { MockRedis.new } if options[:mock_redis]
|
77
|
+
end
|
78
|
+
|
79
|
+
def parsed_url(url)
|
80
|
+
Addressable::URI.parse(url).normalize
|
81
|
+
end
|
82
|
+
|
83
|
+
def load_environment(require_path = options[:require])
|
84
|
+
require File.join(Dir.pwd, require_path) if require_path
|
85
|
+
|
86
|
+
load_rails
|
87
|
+
end
|
88
|
+
|
89
|
+
def load_rails
|
90
|
+
begin
|
91
|
+
require "rails/app_loader"
|
92
|
+
rescue LoadError
|
93
|
+
return
|
94
|
+
end
|
95
|
+
|
96
|
+
return unless Rails::AppLoader.find_executable
|
97
|
+
|
98
|
+
require File.expand_path("config/application", Dir.pwd)
|
99
|
+
require File.expand_path("config/boot", Dir.pwd)
|
100
|
+
require File.expand_path("config/environment", Dir.pwd)
|
101
|
+
end
|
102
|
+
end
|
103
|
+
end
|
data/lib/wayfarer/gc.rb
CHANGED
@@ -1,14 +1,26 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
3
|
module Wayfarer
|
4
|
-
|
5
|
-
|
6
|
-
|
4
|
+
# TODO: Add logging
|
5
|
+
module GC
|
6
|
+
RESETTABLES = [Wayfarer::Redis::Barrier, Wayfarer::Redis::Counter].freeze
|
7
7
|
|
8
|
-
|
8
|
+
class << self
|
9
|
+
include Wayfarer::Logging.emit(
|
10
|
+
after_batch: [:debug, "Running `after_batch` callback"],
|
11
|
+
gc: [:debug, "Garbage collecting %<resettable>s"]
|
12
|
+
)
|
13
|
+
end
|
14
|
+
|
15
|
+
module_function
|
16
|
+
|
17
|
+
def run(task)
|
18
|
+
task[:job].run_callbacks(:batch)
|
9
19
|
|
10
|
-
|
11
|
-
|
20
|
+
RESETTABLES.each do |resettable|
|
21
|
+
log(:gc, task, resettable: resettable)
|
22
|
+
resettable.new(task).reset!
|
23
|
+
end
|
12
24
|
end
|
13
25
|
end
|
14
26
|
end
|