wayfarer 0.4.6 → 0.4.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.env +17 -0
- data/.github/workflows/lint.yaml +27 -0
- data/.github/workflows/release.yaml +30 -0
- data/.github/workflows/tests.yaml +21 -0
- data/.gitignore +5 -1
- data/.rubocop.yml +36 -0
- data/.vale.ini +8 -0
- data/.yardopts +1 -3
- data/Dockerfile +6 -4
- data/Gemfile +24 -0
- data/Gemfile.lock +274 -164
- data/Rakefile +7 -51
- data/bin/wayfarer +1 -1
- data/docker-compose.yml +23 -13
- data/docs/cookbook/consent_screen.md +2 -2
- data/docs/cookbook/executing_javascript.md +3 -3
- data/docs/cookbook/navigation.md +12 -12
- data/docs/cookbook/querying_html.md +3 -3
- data/docs/cookbook/screenshots.md +2 -2
- data/docs/guides/callbacks.md +25 -125
- data/docs/guides/cli.md +71 -0
- data/docs/guides/configuration.md +10 -35
- data/docs/guides/development.md +67 -0
- data/docs/guides/handlers.md +60 -0
- data/docs/guides/index.md +1 -0
- data/docs/guides/jobs.md +142 -31
- data/docs/guides/navigation.md +1 -1
- data/docs/guides/networking/capybara.md +13 -22
- data/docs/guides/networking/custom_adapters.md +103 -41
- data/docs/guides/networking/ferrum.md +4 -4
- data/docs/guides/networking/http.md +9 -13
- data/docs/guides/networking/selenium.md +10 -11
- data/docs/guides/pages.md +78 -10
- data/docs/guides/redis.md +10 -0
- data/docs/guides/routing.md +156 -0
- data/docs/guides/tasks.md +53 -9
- data/docs/guides/tutorial.md +66 -0
- data/docs/guides/user_agents.md +115 -0
- data/docs/index.md +17 -40
- data/lib/wayfarer/base.rb +125 -46
- data/lib/wayfarer/batch_completion.rb +60 -0
- data/lib/wayfarer/callbacks.rb +22 -48
- data/lib/wayfarer/cli/route_printer.rb +85 -89
- data/lib/wayfarer/cli.rb +103 -0
- data/lib/wayfarer/gc.rb +18 -6
- data/lib/wayfarer/handler.rb +15 -7
- data/lib/wayfarer/kv.rb +28 -0
- data/lib/wayfarer/logging.rb +38 -0
- data/lib/wayfarer/middleware/base.rb +2 -0
- data/lib/wayfarer/middleware/batch_completion.rb +19 -0
- data/lib/wayfarer/middleware/chain.rb +7 -1
- data/lib/wayfarer/middleware/content_type.rb +59 -0
- data/lib/wayfarer/middleware/controller.rb +19 -15
- data/lib/wayfarer/middleware/dedup.rb +22 -13
- data/lib/wayfarer/middleware/dispatch.rb +17 -4
- data/lib/wayfarer/middleware/normalize.rb +7 -14
- data/lib/wayfarer/middleware/redis.rb +15 -0
- data/lib/wayfarer/middleware/router.rb +33 -35
- data/lib/wayfarer/middleware/stage.rb +5 -5
- data/lib/wayfarer/middleware/uri_parser.rb +31 -0
- data/lib/wayfarer/middleware/user_agent.rb +49 -0
- data/lib/wayfarer/networking/capybara.rb +1 -1
- data/lib/wayfarer/networking/context.rb +14 -3
- data/lib/wayfarer/networking/ferrum.rb +1 -4
- data/lib/wayfarer/networking/follow.rb +14 -7
- data/lib/wayfarer/networking/http.rb +1 -1
- data/lib/wayfarer/networking/pool.rb +23 -13
- data/lib/wayfarer/networking/selenium.rb +15 -7
- data/lib/wayfarer/networking/strategy.rb +2 -2
- data/lib/wayfarer/page.rb +34 -14
- data/lib/wayfarer/parsing/xml.rb +6 -6
- data/lib/wayfarer/parsing.rb +21 -0
- data/lib/wayfarer/redis/barrier.rb +26 -21
- data/lib/wayfarer/redis/counter.rb +18 -9
- data/lib/wayfarer/redis/pool.rb +1 -1
- data/lib/wayfarer/redis/resettable.rb +19 -0
- data/lib/wayfarer/routing/dsl.rb +166 -30
- data/lib/wayfarer/routing/hash_stack.rb +33 -0
- data/lib/wayfarer/routing/matchers/custom.rb +8 -5
- data/lib/wayfarer/routing/matchers/{suffix.rb → empty_params.rb} +2 -6
- data/lib/wayfarer/routing/matchers/host.rb +15 -9
- data/lib/wayfarer/routing/matchers/path.rb +11 -31
- data/lib/wayfarer/routing/matchers/query.rb +41 -17
- data/lib/wayfarer/routing/matchers/result.rb +12 -0
- data/lib/wayfarer/routing/matchers/scheme.rb +13 -5
- data/lib/wayfarer/routing/matchers/url.rb +13 -5
- data/lib/wayfarer/routing/path_consumer.rb +130 -0
- data/lib/wayfarer/routing/path_finder.rb +151 -23
- data/lib/wayfarer/routing/result.rb +1 -1
- data/lib/wayfarer/routing/root_route.rb +17 -1
- data/lib/wayfarer/routing/route.rb +66 -19
- data/lib/wayfarer/routing/serializable.rb +28 -0
- data/lib/wayfarer/routing/sub_route.rb +53 -0
- data/lib/wayfarer/routing/target_route.rb +17 -1
- data/lib/wayfarer/stringify.rb +21 -30
- data/lib/wayfarer/task.rb +9 -17
- data/lib/wayfarer/uri/normalization.rb +120 -0
- data/lib/wayfarer.rb +72 -5
- data/mise.toml +2 -0
- data/mkdocs.yml +44 -8
- data/rake/docs.rake +26 -0
- data/rake/lint.rake +9 -0
- data/rake/release.rake +23 -0
- data/rake/tests.rake +32 -0
- data/requirements.txt +1 -1
- data/spec/factories/job.rb +8 -0
- data/spec/factories/middleware.rb +2 -2
- data/spec/factories/path_finder.rb +11 -0
- data/spec/factories/redis.rb +19 -0
- data/spec/factories/task.rb +46 -2
- data/spec/spec_helpers.rb +55 -51
- data/spec/support/active_job_helpers.rb +8 -0
- data/spec/support/integration_helpers.rb +21 -0
- data/spec/support/redis_helpers.rb +9 -0
- data/spec/support/test_app.rb +66 -37
- data/spec/wayfarer/base_spec.rb +200 -0
- data/spec/wayfarer/batch_completion_spec.rb +142 -0
- data/spec/wayfarer/cli/job_spec.rb +88 -0
- data/spec/wayfarer/cli/routing_spec.rb +322 -0
- data/spec/{cli → wayfarer/cli}/version_spec.rb +1 -1
- data/spec/wayfarer/gc_spec.rb +29 -0
- data/spec/wayfarer/handler_spec.rb +9 -0
- data/spec/wayfarer/integration/callbacks_spec.rb +200 -0
- data/spec/wayfarer/integration/content_type_spec.rb +37 -0
- data/spec/wayfarer/integration/custom_routing_spec.rb +51 -0
- data/spec/wayfarer/integration/gc_spec.rb +40 -0
- data/spec/wayfarer/integration/handler_spec.rb +65 -0
- data/spec/wayfarer/integration/page_spec.rb +79 -0
- data/spec/wayfarer/integration/params_spec.rb +64 -0
- data/spec/wayfarer/integration/parsing_spec.rb +99 -0
- data/spec/wayfarer/integration/retry_spec.rb +112 -0
- data/spec/wayfarer/integration/stage_spec.rb +58 -0
- data/spec/wayfarer/middleware/batch_completion_spec.rb +33 -0
- data/spec/{middleware → wayfarer/middleware}/chain_spec.rb +24 -19
- data/spec/wayfarer/middleware/content_type_spec.rb +83 -0
- data/spec/{middleware → wayfarer/middleware}/controller_spec.rb +24 -22
- data/spec/wayfarer/middleware/dedup_spec.rb +66 -0
- data/spec/wayfarer/middleware/normalize_spec.rb +32 -0
- data/spec/wayfarer/middleware/router_spec.rb +102 -0
- data/spec/wayfarer/middleware/stage_spec.rb +63 -0
- data/spec/wayfarer/middleware/uri_parser_spec.rb +63 -0
- data/spec/wayfarer/middleware/user_agent_spec.rb +158 -0
- data/spec/wayfarer/networking/capybara_spec.rb +13 -0
- data/spec/{networking → wayfarer/networking}/context_spec.rb +46 -38
- data/spec/wayfarer/networking/ferrum_spec.rb +13 -0
- data/spec/{networking → wayfarer/networking}/follow_spec.rb +11 -6
- data/spec/wayfarer/networking/http_spec.rb +12 -0
- data/spec/{networking → wayfarer/networking}/pool_spec.rb +16 -14
- data/spec/wayfarer/networking/selenium_spec.rb +12 -0
- data/spec/{networking → wayfarer/networking}/strategy.rb +33 -54
- data/spec/wayfarer/page_spec.rb +69 -0
- data/spec/{parsing → wayfarer/parsing}/json_spec.rb +1 -1
- data/spec/wayfarer/parsing/xml_parse_spec.rb +25 -0
- data/spec/wayfarer/redis/barrier_spec.rb +39 -0
- data/spec/wayfarer/redis/counter_spec.rb +34 -0
- data/spec/{redis → wayfarer/redis}/pool_spec.rb +4 -3
- data/spec/{routing → wayfarer/routing}/dsl_spec.rb +12 -22
- data/spec/wayfarer/routing/hash_stack_spec.rb +63 -0
- data/spec/wayfarer/routing/integration_spec.rb +101 -0
- data/spec/wayfarer/routing/matchers/custom_spec.rb +39 -0
- data/spec/wayfarer/routing/matchers/host_spec.rb +56 -0
- data/spec/wayfarer/routing/matchers/matcher.rb +17 -0
- data/spec/wayfarer/routing/matchers/path_spec.rb +43 -0
- data/spec/wayfarer/routing/matchers/query_spec.rb +123 -0
- data/spec/wayfarer/routing/matchers/scheme_spec.rb +45 -0
- data/spec/wayfarer/routing/matchers/url_spec.rb +33 -0
- data/spec/wayfarer/routing/path_consumer_spec.rb +123 -0
- data/spec/wayfarer/routing/path_finder_spec.rb +409 -0
- data/spec/wayfarer/routing/root_route_spec.rb +51 -0
- data/spec/wayfarer/routing/route_spec.rb +74 -0
- data/spec/wayfarer/routing/sub_route_spec.rb +103 -0
- data/spec/wayfarer/task_spec.rb +13 -0
- data/spec/wayfarer/uri/normalization_spec.rb +98 -0
- data/spec/wayfarer_spec.rb +2 -2
- data/wayfarer.gemspec +18 -28
- metadata +797 -265
- data/.github/workflows/ci.yaml +0 -32
- data/.rbenv-gemsets +0 -1
- data/.ruby-version +0 -1
- data/RELEASING.md +0 -17
- data/docs/cookbook/user_agent.md +0 -7
- data/docs/guides/error_handling.md +0 -53
- data/docs/guides/networking.md +0 -94
- data/docs/guides/performance.md +0 -130
- data/docs/guides/reliability.md +0 -41
- data/docs/guides/routing/steering.md +0 -30
- data/docs/reference/api/base.md +0 -48
- data/docs/reference/cli.md +0 -61
- data/docs/reference/configuration_keys.md +0 -43
- data/docs/reference/environment_variables.md +0 -83
- data/lib/wayfarer/cli/base.rb +0 -45
- data/lib/wayfarer/cli/generate.rb +0 -17
- data/lib/wayfarer/cli/job.rb +0 -56
- data/lib/wayfarer/cli/route.rb +0 -29
- data/lib/wayfarer/cli/runner.rb +0 -34
- data/lib/wayfarer/cli/templates/Gemfile.tt +0 -5
- data/lib/wayfarer/cli/templates/job.rb.tt +0 -10
- data/lib/wayfarer/config/capybara.rb +0 -10
- data/lib/wayfarer/config/ferrum.rb +0 -11
- data/lib/wayfarer/config/networking.rb +0 -29
- data/lib/wayfarer/config/redis.rb +0 -14
- data/lib/wayfarer/config/root.rb +0 -11
- data/lib/wayfarer/config/selenium.rb +0 -21
- data/lib/wayfarer/config/strconv.rb +0 -45
- data/lib/wayfarer/config/struct.rb +0 -72
- data/lib/wayfarer/middleware/fetch.rb +0 -56
- data/lib/wayfarer/redis/connection.rb +0 -13
- data/lib/wayfarer/redis/version.rb +0 -19
- data/lib/wayfarer/routing/router.rb +0 -28
- data/spec/base_spec.rb +0 -224
- data/spec/callbacks_spec.rb +0 -102
- data/spec/cli/generate_spec.rb +0 -39
- data/spec/cli/job_spec.rb +0 -78
- data/spec/config/capybara_spec.rb +0 -18
- data/spec/config/ferrum_spec.rb +0 -24
- data/spec/config/networking_spec.rb +0 -73
- data/spec/config/redis_spec.rb +0 -32
- data/spec/config/root_spec.rb +0 -31
- data/spec/config/selenium_spec.rb +0 -56
- data/spec/config/strconv_spec.rb +0 -58
- data/spec/config/struct_spec.rb +0 -66
- data/spec/fixtures/dummy_job.rb +0 -7
- data/spec/gc_spec.rb +0 -59
- data/spec/handler_spec.rb +0 -11
- data/spec/integration/callbacks_spec.rb +0 -85
- data/spec/integration/page_spec.rb +0 -62
- data/spec/integration/params_spec.rb +0 -56
- data/spec/integration/stage_spec.rb +0 -51
- data/spec/integration/steering_spec.rb +0 -57
- data/spec/middleware/dedup_spec.rb +0 -88
- data/spec/middleware/dispatch_spec.rb +0 -43
- data/spec/middleware/fetch_spec.rb +0 -155
- data/spec/middleware/normalize_spec.rb +0 -29
- data/spec/middleware/router_spec.rb +0 -105
- data/spec/middleware/stage_spec.rb +0 -62
- data/spec/networking/capybara_spec.rb +0 -12
- data/spec/networking/ferrum_spec.rb +0 -12
- data/spec/networking/http_spec.rb +0 -12
- data/spec/networking/selenium_spec.rb +0 -12
- data/spec/page_spec.rb +0 -47
- data/spec/parsing/xml_spec.rb +0 -25
- data/spec/redis/barrier_spec.rb +0 -78
- data/spec/redis/counter_spec.rb +0 -32
- data/spec/redis/version_spec.rb +0 -13
- data/spec/routing/integration_spec.rb +0 -110
- data/spec/routing/matchers/custom_spec.rb +0 -31
- data/spec/routing/matchers/host_spec.rb +0 -49
- data/spec/routing/matchers/path_spec.rb +0 -43
- data/spec/routing/matchers/query_spec.rb +0 -137
- data/spec/routing/matchers/scheme_spec.rb +0 -25
- data/spec/routing/matchers/suffix_spec.rb +0 -41
- data/spec/routing/matchers/uri_spec.rb +0 -27
- data/spec/routing/path_finder_spec.rb +0 -33
- data/spec/routing/root_route_spec.rb +0 -29
- data/spec/routing/route_spec.rb +0 -43
- data/spec/routing/router_spec.rb +0 -24
- data/spec/task_spec.rb +0 -34
- data/spec/{stringify_spec.rb → wayfarer/stringify_spec.rb} +2 -2
data/lib/wayfarer/handler.rb
CHANGED
@@ -1,15 +1,23 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
3
|
module Wayfarer
|
4
|
-
|
5
|
-
|
4
|
+
module Handler
|
5
|
+
extend ActiveSupport::Concern
|
6
6
|
|
7
|
-
|
8
|
-
|
7
|
+
included do
|
8
|
+
include Wayfarer::Middleware::Controller
|
9
9
|
|
10
|
-
|
11
|
-
|
10
|
+
use Wayfarer::Middleware::ContentType
|
11
|
+
use Wayfarer::Middleware::Router
|
12
|
+
use Wayfarer::Middleware::Dispatch
|
12
13
|
|
13
|
-
|
14
|
+
api Wayfarer::Middleware::UserAgent
|
15
|
+
api Wayfarer::Middleware::Stage
|
16
|
+
|
17
|
+
singleton_class.undef_method :before_fetch
|
18
|
+
singleton_class.undef_method :around_fetch
|
19
|
+
singleton_class.undef_method :after_fetch
|
20
|
+
singleton_class.undef_method :after_batch
|
21
|
+
end
|
14
22
|
end
|
15
23
|
end
|
data/lib/wayfarer/kv.rb
ADDED
@@ -0,0 +1,28 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Wayfarer
|
4
|
+
# Provides a key-value store via `[]` and `[]=`.
|
5
|
+
#
|
6
|
+
# @api private
|
7
|
+
module KV
|
8
|
+
# @param key [Object] key to fetch
|
9
|
+
# @return [Object, nil] value associated with the key or `nil`
|
10
|
+
def [](key)
|
11
|
+
kv[key]
|
12
|
+
end
|
13
|
+
|
14
|
+
# @param key [Object] key to set
|
15
|
+
# @param value [Object] value to set
|
16
|
+
# @return [Object] value that was set
|
17
|
+
def []=(key, value)
|
18
|
+
kv[key] = value
|
19
|
+
end
|
20
|
+
|
21
|
+
private
|
22
|
+
|
23
|
+
# @return [Hash<Object, Object>]
|
24
|
+
def kv
|
25
|
+
@kv ||= {}
|
26
|
+
end
|
27
|
+
end
|
28
|
+
end
|
data/lib/wayfarer/logging.rb
ADDED
@@ -0,0 +1,38 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Wayfarer
|
4
|
+
module Logging
|
5
|
+
mattr_accessor :logger, default: ActiveSupport::Logger.new($stdout)
|
6
|
+
|
7
|
+
def self.emit(...)
|
8
|
+
Emitter.new(...)
|
9
|
+
end
|
10
|
+
|
11
|
+
class Emitter < Module
|
12
|
+
def initialize(messages)
|
13
|
+
@messages = messages
|
14
|
+
|
15
|
+
super()
|
16
|
+
end
|
17
|
+
|
18
|
+
def included(base)
|
19
|
+
messages = @messages
|
20
|
+
|
21
|
+
base.class_eval do
|
22
|
+
define_method(:log) do |key, task, **args|
|
23
|
+
level, msg = messages[key] || raise(ArgumentError, "No log message for #{key.inspect}")
|
24
|
+
severity = ActiveSupport::Logger::Severity.const_get(level.upcase)
|
25
|
+
|
26
|
+
ActiveSupport::TaggedLogging
|
27
|
+
.new(Logging.logger)
|
28
|
+
.tagged(task.batch, task.url, task[:controller]&.class&.name) do |logger|
|
29
|
+
logger.add(severity, msg % args)
|
30
|
+
end
|
31
|
+
end
|
32
|
+
end
|
33
|
+
end
|
34
|
+
end
|
35
|
+
|
36
|
+
private_constant :Emitter
|
37
|
+
end
|
38
|
+
end
|
data/lib/wayfarer/middleware/batch_completion.rb
ADDED
@@ -0,0 +1,19 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Wayfarer
|
4
|
+
module Middleware
|
5
|
+
class BatchCompletion
|
6
|
+
extend Base
|
7
|
+
|
8
|
+
def call(task)
|
9
|
+
# Comparing to the initial state of `exception_executions` allows
|
10
|
+
# us to determine if an exception occurred when the job was performed,
|
11
|
+
# since the `perform.active_job` event is emitted for both successful
|
12
|
+
# and raising jobs.
|
13
|
+
task[:initial_exception_executions] ||= task[:job].exception_executions.clone
|
14
|
+
|
15
|
+
yield if block_given?
|
16
|
+
end
|
17
|
+
end
|
18
|
+
end
|
19
|
+
end
|
data/lib/wayfarer/middleware/chain.rb
CHANGED
@@ -2,9 +2,15 @@
|
|
2
2
|
|
3
3
|
module Wayfarer
|
4
4
|
module Middleware
|
5
|
-
Chain
|
5
|
+
class Chain
|
6
6
|
extend Forwardable
|
7
7
|
|
8
|
+
attr_reader :middlewares
|
9
|
+
|
10
|
+
def initialize(middlewares)
|
11
|
+
@middlewares = middlewares
|
12
|
+
end
|
13
|
+
|
8
14
|
def self.empty
|
9
15
|
new([])
|
10
16
|
end
|
data/lib/wayfarer/middleware/content_type.rb
ADDED
@@ -0,0 +1,59 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Wayfarer
|
4
|
+
module Middleware
|
5
|
+
class ContentType
|
6
|
+
extend Base
|
7
|
+
|
8
|
+
module API
|
9
|
+
extend ActiveSupport::Concern
|
10
|
+
|
11
|
+
included do
|
12
|
+
class_attribute :allowed_content_types,
|
13
|
+
default: { index: {}, patterns: Set.new },
|
14
|
+
instance_accessor: false,
|
15
|
+
instance_predicate: false
|
16
|
+
end
|
17
|
+
|
18
|
+
class_methods do
|
19
|
+
def content_type(*types)
|
20
|
+
grouped_types = types.group_by(&:class)
|
21
|
+
|
22
|
+
new_index = grouped_types.fetch(String, []).index_with { true }
|
23
|
+
new_patterns = grouped_types.fetch(Regexp, [])
|
24
|
+
|
25
|
+
self.allowed_content_types = {
|
26
|
+
index: allowed_content_types.fetch(:index).merge(new_index),
|
27
|
+
patterns: allowed_content_types.fetch(:patterns).merge(new_patterns)
|
28
|
+
}
|
29
|
+
end
|
30
|
+
end
|
31
|
+
end
|
32
|
+
|
33
|
+
def call(task)
|
34
|
+
yield if block_given? && permitted?(task)
|
35
|
+
end
|
36
|
+
|
37
|
+
private
|
38
|
+
|
39
|
+
def permitted?(task)
|
40
|
+
job = task[:controller]
|
41
|
+
rules = job.class.allowed_content_types
|
42
|
+
|
43
|
+
return true if no_restrictions?(rules)
|
44
|
+
|
45
|
+
return false unless (content_type = task[:page].mime_type&.to_s || task[:page].content_type)
|
46
|
+
|
47
|
+
content_type && matches_rules?(content_type, rules)
|
48
|
+
end
|
49
|
+
|
50
|
+
def matches_rules?(type, rules)
|
51
|
+
rules.fetch(:index).key?(type) || rules.fetch(:patterns).any? { |p| p.match?(type) }
|
52
|
+
end
|
53
|
+
|
54
|
+
def no_restrictions?(rules)
|
55
|
+
rules.values_at(:index, :patterns).all?(&:empty?)
|
56
|
+
end
|
57
|
+
end
|
58
|
+
end
|
59
|
+
end
|
data/lib/wayfarer/middleware/controller.rb
CHANGED
@@ -3,16 +3,20 @@
|
|
3
3
|
module Wayfarer
|
4
4
|
module Middleware
|
5
5
|
module Controller
|
6
|
-
|
7
|
-
base.cattr_accessor :chain, default: Chain.empty
|
8
|
-
base.attr_accessor :task
|
6
|
+
extend ActiveSupport::Concern
|
9
7
|
|
10
|
-
|
11
|
-
|
12
|
-
|
8
|
+
included do
|
9
|
+
class_attribute :chain,
|
10
|
+
default: Chain.empty,
|
11
|
+
instance_accessor: false,
|
12
|
+
instance_predicate: false
|
13
|
+
|
14
|
+
attr_accessor :task
|
15
|
+
|
16
|
+
include Wayfarer::Callbacks
|
13
17
|
end
|
14
18
|
|
15
|
-
|
19
|
+
class_methods do
|
16
20
|
def use(middleware)
|
17
21
|
chain.push(middleware.lazy)
|
18
22
|
api(middleware)
|
@@ -23,17 +27,17 @@ module Wayfarer
|
|
23
27
|
end
|
24
28
|
end
|
25
29
|
|
26
|
-
|
27
|
-
|
28
|
-
self.task = task
|
30
|
+
def call(task)
|
31
|
+
self.task = task
|
29
32
|
|
30
|
-
|
31
|
-
|
33
|
+
task[:job] ||= self
|
34
|
+
task[:controller] = self
|
32
35
|
|
33
|
-
|
34
|
-
|
35
|
-
end
|
36
|
+
self.class.chain.call(task) do
|
37
|
+
yield if block_given?
|
36
38
|
end
|
39
|
+
|
40
|
+
task[:return_value]
|
37
41
|
end
|
38
42
|
end
|
39
43
|
end
|
data/lib/wayfarer/middleware/dedup.rb
CHANGED
@@ -5,25 +5,34 @@ module Wayfarer
|
|
5
5
|
class Dedup
|
6
6
|
extend Base
|
7
7
|
|
8
|
+
include Wayfarer::Logging.emit(
|
9
|
+
deduplicated: [:info, "Deduplicated URL"],
|
10
|
+
retry: [:debug, "Not deduplicating retry"],
|
11
|
+
rerouted: [:debug, "Not deduplicating rerouted task"]
|
12
|
+
)
|
13
|
+
|
8
14
|
def call(task)
|
9
|
-
|
10
|
-
return yield if task.metadata.action
|
15
|
+
task[:barrier] ||= Wayfarer::Redis::Barrier.new(task)
|
11
16
|
|
12
|
-
|
17
|
+
if task[:job].executions > 1
|
18
|
+
log(:retry, task)
|
19
|
+
return (yield if block_given?)
|
20
|
+
end
|
13
21
|
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
task.barrier.unsee(task.url)
|
18
|
-
raise e
|
22
|
+
if task[:job] != task[:controller]
|
23
|
+
log(:rerouted, task)
|
24
|
+
return (yield if block_given?)
|
19
25
|
end
|
20
26
|
|
21
|
-
|
22
|
-
|
27
|
+
return log(:deduplicated, task) if task[:barrier].check!(key(task))
|
28
|
+
|
29
|
+
yield if block_given?
|
30
|
+
end
|
31
|
+
|
32
|
+
private
|
23
33
|
|
24
|
-
|
25
|
-
|
26
|
-
task.metadata.staged_urls = SortedSet.new(unseen)
|
34
|
+
def key(task)
|
35
|
+
Wayfarer.config.dig(:deduplication, :key).call(task)
|
27
36
|
end
|
28
37
|
end
|
29
38
|
end
|
data/lib/wayfarer/middleware/dispatch.rb
CHANGED
@@ -5,13 +5,26 @@ module Wayfarer
|
|
5
5
|
class Dispatch
|
6
6
|
extend Base
|
7
7
|
|
8
|
+
InvalidTargetError = Class.new(ArgumentError)
|
9
|
+
|
8
10
|
def call(task)
|
9
|
-
controller = task
|
11
|
+
controller = task[:controller]
|
10
12
|
|
11
|
-
controller.run_callbacks(:action) do
|
12
|
-
case action = task
|
13
|
+
task[:return_value] = controller.run_callbacks(:action) do
|
14
|
+
case action = task[:action]
|
13
15
|
when Symbol then controller.public_send(action)
|
14
|
-
|
16
|
+
when Array
|
17
|
+
handler, method = action
|
18
|
+
task[:action] = method
|
19
|
+
handler.new.call(task)
|
20
|
+
else
|
21
|
+
unless action&.include?(Wayfarer::Handler)
|
22
|
+
raise InvalidTargetError, "routed to invalid action: #{action.inspect}"
|
23
|
+
end
|
24
|
+
|
25
|
+
task[:action] = nil
|
26
|
+
|
27
|
+
action.new.call(task)
|
15
28
|
end
|
16
29
|
end
|
17
30
|
|
data/lib/wayfarer/middleware/normalize.rb
CHANGED
@@ -4,23 +4,16 @@ module Wayfarer
|
|
4
4
|
module Middleware
|
5
5
|
class Normalize
|
6
6
|
extend Base
|
7
|
+
include Wayfarer::Logging.emit(
|
8
|
+
invalid: [:info, "Failed to normalize URL"]
|
9
|
+
)
|
7
10
|
|
8
11
|
def call(task)
|
9
|
-
|
10
|
-
|
11
|
-
task.metadata.staged_urls = SortedSet.new(normalized_urls(task).compact)
|
12
|
-
end
|
13
|
-
|
14
|
-
private
|
12
|
+
Wayfarer::URI::Normalization.canonical!(task[:uri])
|
15
13
|
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
def normalize(url)
|
21
|
-
NormalizeUrl.process(url)
|
22
|
-
rescue NormalizeUrl::InvalidURIError
|
23
|
-
nil
|
14
|
+
yield if block_given?
|
15
|
+
rescue Wayfarer::URI::Normalization::InvalidURIError
|
16
|
+
log(:invalid, task)
|
24
17
|
end
|
25
18
|
end
|
26
19
|
end
|
data/lib/wayfarer/middleware/router.rb
CHANGED
@@ -5,51 +5,49 @@ module Wayfarer
|
|
5
5
|
class Router
|
6
6
|
extend Base
|
7
7
|
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
module ClassMethods
|
15
|
-
def router
|
16
|
-
# TODO: Use cattr_accessor
|
17
|
-
@router ||= Wayfarer::Routing::Router.new
|
18
|
-
end
|
8
|
+
include Wayfarer::Logging.emit(
|
9
|
+
mismatch: [:info, "No matching route"],
|
10
|
+
match: [:info, "Routing to %<action>s"],
|
11
|
+
already_routed: [:debug, "Already routed to %<action>s"]
|
12
|
+
)
|
19
13
|
|
20
|
-
|
21
|
-
|
22
|
-
end
|
14
|
+
module API
|
15
|
+
extend ActiveSupport::Concern
|
23
16
|
|
24
|
-
|
25
|
-
|
26
|
-
|
17
|
+
included do
|
18
|
+
class_attribute :route,
|
19
|
+
default: Wayfarer::Routing::RootRoute.new,
|
20
|
+
instance_accessor: false,
|
21
|
+
instance_predicate: false
|
27
22
|
end
|
28
23
|
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
end
|
24
|
+
def action
|
25
|
+
task[:action]
|
26
|
+
end
|
33
27
|
|
34
|
-
|
35
|
-
|
36
|
-
end
|
28
|
+
def params
|
29
|
+
task[:params]
|
37
30
|
end
|
38
31
|
end
|
39
32
|
|
40
33
|
def call(task)
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
34
|
+
# Avoid rerouting when dispatching a [Controller, :action] pair
|
35
|
+
if (action = task[:action])
|
36
|
+
log(:already_routed, task, action: action)
|
37
|
+
|
38
|
+
return (yield if block_given?)
|
39
|
+
end
|
40
|
+
|
41
|
+
case result = task[:controller].class.route.invoke(task)
|
42
|
+
when Routing::Result::Mismatch then return log(:mismatch, task)
|
49
43
|
when Routing::Result::Match
|
50
|
-
|
51
|
-
|
52
|
-
task.
|
44
|
+
action = result.action
|
45
|
+
|
46
|
+
log(:match, task, action: action.inspect)
|
47
|
+
|
48
|
+
task[:action] = action
|
49
|
+
task[:params] ||= ActiveSupport::HashWithIndifferentAccess.new
|
50
|
+
task[:params].merge!(result.params)
|
53
51
|
end
|
54
52
|
|
55
53
|
yield if block_given?
|
data/lib/wayfarer/middleware/stage.rb
CHANGED
@@ -7,20 +7,20 @@ module Wayfarer
|
|
7
7
|
|
8
8
|
module API
|
9
9
|
def stage(urls)
|
10
|
-
Array.wrap(urls).each { |url| task
|
10
|
+
Array.wrap(urls).each { |url| task[:staged_urls].add(url.to_s) }
|
11
11
|
end
|
12
12
|
end
|
13
13
|
|
14
14
|
def call(task)
|
15
|
-
task
|
15
|
+
task[:staged_urls] = Set.new
|
16
16
|
|
17
17
|
yield if block_given?
|
18
18
|
|
19
|
-
task
|
20
|
-
task
|
19
|
+
task[:staged_urls].each do |url|
|
20
|
+
task[:job].class.crawl(url, batch: task.batch)
|
21
21
|
end
|
22
22
|
|
23
|
-
task
|
23
|
+
task[:staged_urls].clear
|
24
24
|
end
|
25
25
|
end
|
26
26
|
end
|
data/lib/wayfarer/middleware/uri_parser.rb
ADDED
@@ -0,0 +1,31 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Wayfarer
|
4
|
+
module Middleware
|
5
|
+
class UriParser
|
6
|
+
extend Base
|
7
|
+
|
8
|
+
include Wayfarer::Logging.emit(
|
9
|
+
invalid: [:info, "Not processing invalid URL (%<message>s)"]
|
10
|
+
)
|
11
|
+
|
12
|
+
module API
|
13
|
+
def uri
|
14
|
+
task[:uri]
|
15
|
+
end
|
16
|
+
end
|
17
|
+
|
18
|
+
def call(task)
|
19
|
+
return (yield if block_given?) if task[:uri]
|
20
|
+
|
21
|
+
begin
|
22
|
+
task[:uri] = Addressable::URI.parse(task.url)
|
23
|
+
rescue Addressable::URI::InvalidURIError => e
|
24
|
+
return log(:invalid, task, message: e.message)
|
25
|
+
end
|
26
|
+
|
27
|
+
yield if block_given?
|
28
|
+
end
|
29
|
+
end
|
30
|
+
end
|
31
|
+
end
|
data/lib/wayfarer/middleware/user_agent.rb
ADDED
@@ -0,0 +1,49 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Wayfarer
|
4
|
+
module Middleware
|
5
|
+
class UserAgent
|
6
|
+
extend Base
|
7
|
+
|
8
|
+
module API
|
9
|
+
def user_agent
|
10
|
+
task[:context]&.instance
|
11
|
+
end
|
12
|
+
|
13
|
+
def page(live: false)
|
14
|
+
return task[:page] unless live
|
15
|
+
|
16
|
+
task[:page] = task[:context].live&.page || task[:page]
|
17
|
+
end
|
18
|
+
|
19
|
+
def fetch(url, follow: 3)
|
20
|
+
(@http ||= Wayfarer::Networking::Follow.http).fetch(url, follow: follow)
|
21
|
+
end
|
22
|
+
end
|
23
|
+
|
24
|
+
def call(task)
|
25
|
+
pool.with do |context|
|
26
|
+
task[:context] = context
|
27
|
+
|
28
|
+
result = task[:controller].run_callbacks(:fetch) do
|
29
|
+
context.fetch(task.url)
|
30
|
+
end
|
31
|
+
|
32
|
+
case result
|
33
|
+
when Networking::Result::Redirect
|
34
|
+
task[:controller].stage(result.redirect_url)
|
35
|
+
when Networking::Result::Success
|
36
|
+
task[:page] = result.page
|
37
|
+
yield if block_given?
|
38
|
+
end
|
39
|
+
end
|
40
|
+
end
|
41
|
+
|
42
|
+
private
|
43
|
+
|
44
|
+
def pool
|
45
|
+
Wayfarer::Networking::Pool.instance
|
46
|
+
end
|
47
|
+
end
|
48
|
+
end
|
49
|
+
end
|
data/lib/wayfarer/networking/context.rb
CHANGED
@@ -2,11 +2,22 @@
|
|
2
2
|
|
3
3
|
module Wayfarer
|
4
4
|
module Networking
|
5
|
-
Context
|
5
|
+
class Context
|
6
|
+
attr_reader :strategy
|
7
|
+
|
8
|
+
def initialize(strategy)
|
9
|
+
@strategy = strategy
|
10
|
+
end
|
11
|
+
|
6
12
|
def fetch(url)
|
7
13
|
supervise { strategy.fetch(instance, url) }
|
8
14
|
end
|
9
15
|
|
16
|
+
def navigate(url)
|
17
|
+
fetch(url)
|
18
|
+
live
|
19
|
+
end
|
20
|
+
|
10
21
|
def live
|
11
22
|
supervise { strategy.live(instance) }
|
12
23
|
end
|
@@ -25,8 +36,8 @@ module Wayfarer
|
|
25
36
|
|
26
37
|
def supervise
|
27
38
|
yield
|
28
|
-
rescue *strategy.renew_on, *Wayfarer.config
|
29
|
-
renew
|
39
|
+
rescue *strategy.renew_on, *Wayfarer.config[:network][:renew_on] => e
|
40
|
+
renew # may raise
|
30
41
|
ensure
|
31
42
|
# If renewing raises, re-raise the originally caught exception
|
32
43
|
# TODO: Not nice this effectively swallows exceptions
|
data/lib/wayfarer/networking/ferrum.rb
CHANGED
@@ -10,13 +10,10 @@ module Wayfarer
|
|
10
10
|
end
|
11
11
|
|
12
12
|
def create
|
13
|
-
::Ferrum::Browser.new(Wayfarer.config.ferrum
|
14
|
-
browser.headers.set(Wayfarer.config.network.http_headers)
|
15
|
-
end
|
13
|
+
::Ferrum::Browser.new(Wayfarer.config.dig(:ferrum, :options))
|
16
14
|
end
|
17
15
|
|
18
16
|
def destroy(instance)
|
19
|
-
instance.reset
|
20
17
|
instance.quit
|
21
18
|
end
|
22
19
|
|
data/lib/wayfarer/networking/follow.rb
CHANGED
@@ -2,17 +2,24 @@
|
|
2
2
|
|
3
3
|
module Wayfarer
|
4
4
|
module Networking
|
5
|
-
|
5
|
+
class Follow
|
6
|
+
RedirectsExhaustedError = Class.new(StandardError)
|
6
7
|
|
7
|
-
|
8
|
-
|
8
|
+
def self.http
|
9
|
+
new(Wayfarer::Networking::Context.new(Wayfarer::Networking::HTTP.new))
|
10
|
+
end
|
11
|
+
|
12
|
+
attr_reader :context
|
9
13
|
|
10
|
-
|
14
|
+
def initialize(context)
|
15
|
+
@context = context
|
16
|
+
end
|
11
17
|
|
12
|
-
def fetch(url, follow:
|
13
|
-
raise RedirectsExhaustedError if follow
|
18
|
+
def fetch(url, follow:)
|
19
|
+
raise RedirectsExhaustedError if follow < 0
|
14
20
|
|
15
|
-
|
21
|
+
result = context.fetch(url)
|
22
|
+
case result
|
16
23
|
when Result::Success then result.page
|
17
24
|
when Result::Redirect then fetch(result.redirect_url, follow: follow - 1)
|
18
25
|
end
|