wayfarer 0.4.7 → 0.4.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.env +17 -0
- data/.github/workflows/lint.yaml +8 -6
- data/.github/workflows/release.yaml +4 -3
- data/.github/workflows/tests.yaml +5 -14
- data/.gitignore +2 -2
- data/.rubocop.yml +31 -0
- data/.vale.ini +6 -3
- data/Dockerfile +3 -2
- data/Gemfile +21 -0
- data/Gemfile.lock +233 -128
- data/Rakefile +7 -0
- data/docker-compose.yml +13 -14
- data/docs/guides/callbacks.md +3 -1
- data/docs/guides/configuration.md +10 -35
- data/docs/guides/development.md +67 -0
- data/docs/guides/handlers.md +7 -7
- data/docs/guides/jobs.md +54 -11
- data/docs/guides/networking/custom_adapters.md +31 -10
- data/docs/guides/pages.md +24 -22
- data/docs/guides/routing.md +116 -34
- data/docs/guides/tasks.md +30 -10
- data/docs/guides/tutorial.md +23 -17
- data/docs/guides/user_agents.md +11 -9
- data/lib/wayfarer/base.rb +9 -8
- data/lib/wayfarer/batch_completion.rb +18 -14
- data/lib/wayfarer/callbacks.rb +14 -14
- data/lib/wayfarer/cli/route_printer.rb +78 -96
- data/lib/wayfarer/cli.rb +12 -30
- data/lib/wayfarer/gc.rb +6 -1
- data/lib/wayfarer/kv.rb +28 -0
- data/lib/wayfarer/middleware/chain.rb +7 -1
- data/lib/wayfarer/middleware/content_type.rb +20 -15
- data/lib/wayfarer/middleware/dedup.rb +9 -3
- data/lib/wayfarer/middleware/dispatch.rb +7 -2
- data/lib/wayfarer/middleware/normalize.rb +4 -12
- data/lib/wayfarer/middleware/router.rb +1 -1
- data/lib/wayfarer/middleware/uri_parser.rb +4 -3
- data/lib/wayfarer/networking/context.rb +12 -1
- data/lib/wayfarer/networking/ferrum.rb +1 -4
- data/lib/wayfarer/networking/follow.rb +2 -1
- data/lib/wayfarer/networking/pool.rb +12 -7
- data/lib/wayfarer/networking/selenium.rb +15 -7
- data/lib/wayfarer/page.rb +0 -2
- data/lib/wayfarer/parsing/xml.rb +1 -1
- data/lib/wayfarer/parsing.rb +2 -5
- data/lib/wayfarer/redis/barrier.rb +15 -2
- data/lib/wayfarer/redis/counter.rb +1 -2
- data/lib/wayfarer/routing/dsl.rb +166 -31
- data/lib/wayfarer/routing/hash_stack.rb +33 -0
- data/lib/wayfarer/routing/matchers/custom.rb +8 -5
- data/lib/wayfarer/routing/matchers/{suffix.rb → empty_params.rb} +2 -6
- data/lib/wayfarer/routing/matchers/host.rb +15 -9
- data/lib/wayfarer/routing/matchers/path.rb +11 -33
- data/lib/wayfarer/routing/matchers/query.rb +41 -17
- data/lib/wayfarer/routing/matchers/result.rb +12 -0
- data/lib/wayfarer/routing/matchers/scheme.rb +13 -5
- data/lib/wayfarer/routing/matchers/url.rb +13 -5
- data/lib/wayfarer/routing/path_consumer.rb +130 -0
- data/lib/wayfarer/routing/path_finder.rb +151 -23
- data/lib/wayfarer/routing/result.rb +1 -1
- data/lib/wayfarer/routing/root_route.rb +14 -2
- data/lib/wayfarer/routing/route.rb +71 -14
- data/lib/wayfarer/routing/serializable.rb +28 -0
- data/lib/wayfarer/routing/sub_route.rb +53 -0
- data/lib/wayfarer/routing/target_route.rb +17 -1
- data/lib/wayfarer/stringify.rb +1 -2
- data/lib/wayfarer/task.rb +3 -5
- data/lib/wayfarer/uri/normalization.rb +120 -0
- data/lib/wayfarer.rb +50 -10
- data/mise.toml +2 -0
- data/mkdocs.yml +8 -17
- data/rake/lint.rake +0 -96
- data/rake/release.rake +5 -11
- data/rake/tests.rake +8 -4
- data/requirements.txt +1 -1
- data/spec/factories/job.rb +8 -0
- data/spec/factories/middleware.rb +2 -2
- data/spec/factories/path_finder.rb +11 -0
- data/spec/factories/redis.rb +19 -0
- data/spec/factories/task.rb +39 -1
- data/spec/spec_helpers.rb +50 -57
- data/spec/support/active_job_helpers.rb +8 -0
- data/spec/support/integration_helpers.rb +21 -0
- data/spec/support/redis_helpers.rb +9 -0
- data/spec/support/test_app.rb +64 -43
- data/spec/{base_spec.rb → wayfarer/base_spec.rb} +32 -36
- data/spec/wayfarer/batch_completion_spec.rb +142 -0
- data/spec/wayfarer/cli/job_spec.rb +88 -0
- data/spec/wayfarer/cli/routing_spec.rb +322 -0
- data/spec/{cli → wayfarer/cli}/version_spec.rb +1 -1
- data/spec/wayfarer/gc_spec.rb +29 -0
- data/spec/{handler_spec.rb → wayfarer/handler_spec.rb} +1 -3
- data/spec/{integration → wayfarer/integration}/callbacks_spec.rb +9 -6
- data/spec/wayfarer/integration/content_type_spec.rb +37 -0
- data/spec/wayfarer/integration/custom_routing_spec.rb +51 -0
- data/spec/{integration → wayfarer/integration}/gc_spec.rb +9 -13
- data/spec/{integration → wayfarer/integration}/handler_spec.rb +9 -10
- data/spec/{integration → wayfarer/integration}/page_spec.rb +8 -6
- data/spec/{integration → wayfarer/integration}/params_spec.rb +4 -4
- data/spec/{integration → wayfarer/integration}/parsing_spec.rb +7 -33
- data/spec/wayfarer/integration/retry_spec.rb +112 -0
- data/spec/{integration → wayfarer/integration}/stage_spec.rb +5 -5
- data/spec/{middleware → wayfarer/middleware}/batch_completion_spec.rb +4 -5
- data/spec/{middleware → wayfarer/middleware}/chain_spec.rb +20 -15
- data/spec/{middleware → wayfarer/middleware}/content_type_spec.rb +18 -21
- data/spec/{middleware → wayfarer/middleware}/controller_spec.rb +22 -20
- data/spec/wayfarer/middleware/dedup_spec.rb +66 -0
- data/spec/wayfarer/middleware/normalize_spec.rb +32 -0
- data/spec/{middleware → wayfarer/middleware}/router_spec.rb +18 -20
- data/spec/{middleware → wayfarer/middleware}/stage_spec.rb +11 -10
- data/spec/wayfarer/middleware/uri_parser_spec.rb +63 -0
- data/spec/{middleware → wayfarer/middleware}/user_agent_spec.rb +34 -32
- data/spec/wayfarer/networking/capybara_spec.rb +13 -0
- data/spec/{networking → wayfarer/networking}/context_spec.rb +46 -38
- data/spec/wayfarer/networking/ferrum_spec.rb +13 -0
- data/spec/{networking → wayfarer/networking}/follow_spec.rb +9 -4
- data/spec/wayfarer/networking/http_spec.rb +12 -0
- data/spec/{networking → wayfarer/networking}/pool_spec.rb +11 -9
- data/spec/wayfarer/networking/selenium_spec.rb +12 -0
- data/spec/{networking → wayfarer/networking}/strategy.rb +33 -54
- data/spec/{page_spec.rb → wayfarer/page_spec.rb} +3 -3
- data/spec/{parsing → wayfarer/parsing}/json_spec.rb +1 -1
- data/spec/{parsing/xml_spec.rb → wayfarer/parsing/xml_parse_spec.rb} +4 -3
- data/spec/{redis → wayfarer/redis}/barrier_spec.rb +5 -4
- data/spec/wayfarer/redis/counter_spec.rb +34 -0
- data/spec/{redis → wayfarer/redis}/pool_spec.rb +3 -2
- data/spec/{routing → wayfarer/routing}/dsl_spec.rb +12 -22
- data/spec/wayfarer/routing/hash_stack_spec.rb +63 -0
- data/spec/wayfarer/routing/integration_spec.rb +101 -0
- data/spec/wayfarer/routing/matchers/custom_spec.rb +39 -0
- data/spec/wayfarer/routing/matchers/host_spec.rb +56 -0
- data/spec/wayfarer/routing/matchers/matcher.rb +17 -0
- data/spec/wayfarer/routing/matchers/path_spec.rb +43 -0
- data/spec/wayfarer/routing/matchers/query_spec.rb +123 -0
- data/spec/wayfarer/routing/matchers/scheme_spec.rb +45 -0
- data/spec/wayfarer/routing/matchers/url_spec.rb +33 -0
- data/spec/wayfarer/routing/path_consumer_spec.rb +123 -0
- data/spec/wayfarer/routing/path_finder_spec.rb +409 -0
- data/spec/wayfarer/routing/root_route_spec.rb +51 -0
- data/spec/wayfarer/routing/route_spec.rb +74 -0
- data/spec/wayfarer/routing/sub_route_spec.rb +103 -0
- data/spec/wayfarer/uri/normalization_spec.rb +98 -0
- data/spec/wayfarer_spec.rb +2 -2
- data/wayfarer.gemspec +17 -28
- metadata +768 -246
- data/.rbenv-gemsets +0 -1
- data/.ruby-version +0 -1
- data/RELEASING.md +0 -17
- data/docs/cookbook/user_agent.md +0 -7
- data/docs/design.md +0 -36
- data/docs/guides/jobs/error_handling.md +0 -40
- data/docs/reference/configuration.md +0 -36
- data/spec/batch_completion_spec.rb +0 -104
- data/spec/cli/job_spec.rb +0 -74
- data/spec/cli/routing_spec.rb +0 -101
- data/spec/fixtures/dummy_job.rb +0 -9
- data/spec/gc_spec.rb +0 -17
- data/spec/integration/content_type_spec.rb +0 -145
- data/spec/integration/routing_spec.rb +0 -18
- data/spec/middleware/dedup_spec.rb +0 -71
- data/spec/middleware/dispatch_spec.rb +0 -59
- data/spec/middleware/normalize_spec.rb +0 -60
- data/spec/middleware/uri_parser_spec.rb +0 -53
- data/spec/networking/capybara_spec.rb +0 -12
- data/spec/networking/ferrum_spec.rb +0 -12
- data/spec/networking/http_spec.rb +0 -12
- data/spec/networking/selenium_spec.rb +0 -12
- data/spec/redis/counter_spec.rb +0 -44
- data/spec/routing/integration_spec.rb +0 -110
- data/spec/routing/matchers/custom_spec.rb +0 -31
- data/spec/routing/matchers/host_spec.rb +0 -49
- data/spec/routing/matchers/path_spec.rb +0 -43
- data/spec/routing/matchers/query_spec.rb +0 -137
- data/spec/routing/matchers/scheme_spec.rb +0 -25
- data/spec/routing/matchers/suffix_spec.rb +0 -41
- data/spec/routing/matchers/uri_spec.rb +0 -27
- data/spec/routing/path_finder_spec.rb +0 -33
- data/spec/routing/root_route_spec.rb +0 -29
- data/spec/routing/route_spec.rb +0 -43
- data/docs/{reference → guides}/cli.md +0 -0
- data/spec/{stringify_spec.rb → wayfarer/stringify_spec.rb} +2 -2
- /data/spec/{task_spec.rb → wayfarer/task_spec.rb} +0 -0
@@ -5,33 +5,90 @@ module Wayfarer
|
|
5
5
|
class Route
|
6
6
|
include DSL
|
7
7
|
include Stringify
|
8
|
-
|
8
|
+
include Serializable
|
9
9
|
|
10
|
-
attr_reader :
|
10
|
+
attr_reader :matcher,
|
11
|
+
:parent,
|
12
|
+
:children
|
11
13
|
|
12
|
-
|
13
|
-
:parent,
|
14
|
-
:action,
|
15
|
-
:path_offset
|
14
|
+
stringify :matcher
|
16
15
|
|
17
|
-
|
18
|
-
:action,
|
19
|
-
:path_offset
|
16
|
+
EMPTY_PARAMS = {}.freeze
|
20
17
|
|
21
|
-
def initialize(
|
18
|
+
def initialize(
|
19
|
+
parent: nil,
|
20
|
+
matcher: nil,
|
21
|
+
action: nil,
|
22
|
+
**options,
|
23
|
+
&block
|
24
|
+
)
|
25
|
+
raise "missing parent" unless parent || is_a?(RootRoute)
|
26
|
+
|
27
|
+
@parent = parent
|
22
28
|
@matcher = matcher
|
29
|
+
@action = action
|
30
|
+
|
23
31
|
@children = []
|
24
|
-
|
32
|
+
|
33
|
+
leaf = options.reduce(self) { |acc, (key, val)| acc.public_send(key, val) }
|
34
|
+
leaf.instance_eval(&block) if block
|
25
35
|
end
|
26
36
|
|
27
|
-
#
|
37
|
+
# @return [true, false]
|
38
|
+
def root?
|
39
|
+
parent.nil?
|
40
|
+
end
|
41
|
+
|
42
|
+
# @return [true, false]
|
43
|
+
def leaf?
|
44
|
+
children.empty?
|
45
|
+
end
|
46
|
+
|
47
|
+
# @return [false]
|
48
|
+
def target?
|
49
|
+
false
|
50
|
+
end
|
51
|
+
|
52
|
+
# Accepts a visitor for pre-order traversal.
|
28
53
|
def accept(visitor)
|
29
|
-
|
54
|
+
visitor.enter(self)
|
55
|
+
|
56
|
+
return visitor.leave unless visitor.visit(self)
|
30
57
|
|
31
58
|
children.each { |child| child.accept(visitor) }
|
59
|
+
|
60
|
+
visitor.leave
|
32
61
|
end
|
33
62
|
|
34
|
-
|
63
|
+
# @param [path_finder] PathFinder
|
64
|
+
# @return [Hash]
|
65
|
+
def params(path_finder)
|
66
|
+
matcher&.params(path_finder) || EMPTY_PARAMS
|
67
|
+
end
|
68
|
+
|
69
|
+
# @param [_path_finder] PathFinder
|
70
|
+
# @return [nil, Symbol, Wayfarer::Handler]
|
71
|
+
def action(_path_finder)
|
72
|
+
@action
|
73
|
+
end
|
74
|
+
|
75
|
+
# @param [path_finder] PathFinder
|
76
|
+
# @return [Result::Match, Result::Mismatch, Object]
|
77
|
+
def match(path_finder)
|
78
|
+
evaluate(path_finder)
|
79
|
+
end
|
80
|
+
|
81
|
+
# @param [path_finder] PathFinder
|
82
|
+
# @return [true, false, Wayfarer::Routing::Route]
|
83
|
+
def evaluate(path_finder)
|
84
|
+
matcher.evaluate(path_finder)
|
85
|
+
end
|
86
|
+
|
87
|
+
def to_h
|
88
|
+
return {} unless matcher
|
89
|
+
|
90
|
+
{ matcher.class.name.demodulize.underscore => matcher.to_h }
|
91
|
+
end
|
35
92
|
end
|
36
93
|
end
|
37
94
|
end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
# lib/wayfarer/routing/hash_serialisable.rb
|
2
|
+
# frozen_string_literal: true
|
3
|
+
|
4
|
+
module Wayfarer
|
5
|
+
module Routing
|
6
|
+
module Serializable
|
7
|
+
def to_h
|
8
|
+
as_hash(self)
|
9
|
+
end
|
10
|
+
|
11
|
+
private
|
12
|
+
|
13
|
+
def as_hash(route)
|
14
|
+
{
|
15
|
+
matcher: matcher_name(route),
|
16
|
+
action: route.action(nil),
|
17
|
+
children: route.children.map { |child| as_hash(child) }
|
18
|
+
}.tap(&:compact!)
|
19
|
+
end
|
20
|
+
|
21
|
+
def matcher_name(route)
|
22
|
+
return nil unless route.matcher
|
23
|
+
|
24
|
+
route.matcher.class.name.split("::").last
|
25
|
+
end
|
26
|
+
end
|
27
|
+
end
|
28
|
+
end
|
@@ -0,0 +1,53 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Wayfarer
|
4
|
+
module Routing
|
5
|
+
class SubRoute < Route
|
6
|
+
def evaluate(path_finder)
|
7
|
+
handle(
|
8
|
+
path_finder,
|
9
|
+
match: ->(_result) { true },
|
10
|
+
mismatch: ->(_result) { false }
|
11
|
+
)
|
12
|
+
end
|
13
|
+
|
14
|
+
def params(path_finder)
|
15
|
+
handle(
|
16
|
+
path_finder,
|
17
|
+
match: lambda(&:params),
|
18
|
+
mismatch: ->(_result) { EMPTY_PARAMS }
|
19
|
+
)
|
20
|
+
end
|
21
|
+
|
22
|
+
def action(path_finder)
|
23
|
+
handle(
|
24
|
+
path_finder,
|
25
|
+
match: lambda(&:action),
|
26
|
+
mismatch: ->(_result) {}
|
27
|
+
)
|
28
|
+
end
|
29
|
+
|
30
|
+
private
|
31
|
+
|
32
|
+
def handle(path_finder, match:, mismatch:)
|
33
|
+
case root = evaluate_matcher(path_finder)
|
34
|
+
when Wayfarer::Routing::RootRoute
|
35
|
+
case result = sub_result(root, path_finder)
|
36
|
+
when Wayfarer::Routing::Result::Match then match.call(result)
|
37
|
+
when Wayfarer::Routing::Result::Mismatch then mismatch.call(result)
|
38
|
+
else raise "invalid result: #{result.inspect}"
|
39
|
+
end
|
40
|
+
else raise "#{route.inspect} is not a root route"
|
41
|
+
end
|
42
|
+
end
|
43
|
+
|
44
|
+
def evaluate_matcher(path_finder)
|
45
|
+
path_finder[matcher] ||= matcher.evaluate(path_finder)
|
46
|
+
end
|
47
|
+
|
48
|
+
def sub_result(route, path_finder)
|
49
|
+
path_finder[route] ||= Wayfarer::Routing::PathFinder.sub_result(route, path_finder)
|
50
|
+
end
|
51
|
+
end
|
52
|
+
end
|
53
|
+
end
|
@@ -2,6 +2,22 @@
|
|
2
2
|
|
3
3
|
module Wayfarer
|
4
4
|
module Routing
|
5
|
-
class TargetRoute < Route
|
5
|
+
class TargetRoute < Route
|
6
|
+
def evaluate(_path_finder)
|
7
|
+
true
|
8
|
+
end
|
9
|
+
|
10
|
+
def target?
|
11
|
+
true
|
12
|
+
end
|
13
|
+
|
14
|
+
def to_h
|
15
|
+
{ action: case @action
|
16
|
+
when Wayfarer::Handler then { class: @action.class.name }
|
17
|
+
when Array then { handler: @action.first.class.name, action: @action.second }
|
18
|
+
else @action
|
19
|
+
end }
|
20
|
+
end
|
21
|
+
end
|
6
22
|
end
|
7
23
|
end
|
data/lib/wayfarer/stringify.rb
CHANGED
@@ -25,8 +25,7 @@ module Wayfarer
|
|
25
25
|
if self.class.stringified_attributes.any?
|
26
26
|
attrs = self.class
|
27
27
|
.stringified_attributes
|
28
|
-
.
|
29
|
-
.to_h
|
28
|
+
.to_h { |attr| [attr, public_send(attr)] }
|
30
29
|
.map { |k, v| "#{k}=#{v.inspect}" }
|
31
30
|
.join(", ")
|
32
31
|
|
data/lib/wayfarer/task.rb
CHANGED
@@ -6,20 +6,18 @@ module Wayfarer
|
|
6
6
|
# @!attribute [r] batch
|
7
7
|
# @return [String] the batch the task belongs to
|
8
8
|
class Task
|
9
|
-
|
9
|
+
include KV
|
10
10
|
include Stringify
|
11
11
|
|
12
|
-
attr_reader :url,
|
12
|
+
attr_reader :url,
|
13
|
+
:batch
|
13
14
|
|
14
15
|
stringify :url, :batch
|
15
16
|
|
16
|
-
delegate %i([] []=) => :@ephemeral
|
17
|
-
|
18
17
|
# @!visibility private
|
19
18
|
def initialize(url, batch)
|
20
19
|
@url = url
|
21
20
|
@batch = batch
|
22
|
-
@ephemeral = {}
|
23
21
|
end
|
24
22
|
|
25
23
|
# @!visibility private
|
@@ -0,0 +1,120 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Wayfarer
|
4
|
+
module URI
|
5
|
+
# HTTP(S) URL normalization.
|
6
|
+
module Normalization
|
7
|
+
InvalidURIError = Class.new(StandardError)
|
8
|
+
|
9
|
+
# Raised when URI is relative.
|
10
|
+
RelativeURIError = Class.new(InvalidURIError)
|
11
|
+
|
12
|
+
# Raised when URI scheme is not hypertext.
|
13
|
+
NoHypertextError = Class.new(InvalidURIError)
|
14
|
+
|
15
|
+
# Raised when URI has no host.
|
16
|
+
NoHostError = Class.new(InvalidURIError)
|
17
|
+
|
18
|
+
extend self
|
19
|
+
|
20
|
+
# Normalizes `uri` in-place.
|
21
|
+
# @param uri [Addressable::URI]
|
22
|
+
# @raise [InvalidURIError]
|
23
|
+
# @return [Addressable::URI]
|
24
|
+
def canonical!(uri)
|
25
|
+
had_no_path = uri.path.blank?
|
26
|
+
|
27
|
+
uri.normalize!
|
28
|
+
validate_uri!(uri)
|
29
|
+
|
30
|
+
normalize_host!(uri) if remove_www?
|
31
|
+
|
32
|
+
if remove_trailing_slash?
|
33
|
+
normalize_path!(uri)
|
34
|
+
root_path!(uri)
|
35
|
+
end
|
36
|
+
|
37
|
+
remove_fragment!(uri) if remove_fragment?
|
38
|
+
normalize_query_params!(uri)
|
39
|
+
|
40
|
+
root_path!(uri) if had_no_path && uri.query.nil?
|
41
|
+
|
42
|
+
uri
|
43
|
+
end
|
44
|
+
|
45
|
+
private
|
46
|
+
|
47
|
+
def validate_uri!(uri)
|
48
|
+
raise RelativeURIError, "URL is not absolute" unless uri.absolute?
|
49
|
+
raise NoHypertextError, "URL is using unsupported protocol" unless supported_protocols.include?(uri.scheme)
|
50
|
+
raise NoHostError, "URL misses hostname" if uri.host.blank?
|
51
|
+
end
|
52
|
+
|
53
|
+
def normalize_query_params!(uri)
|
54
|
+
return unless remove_tracking_parameters? || order_query_parameters?
|
55
|
+
return unless (params = uri.query_values(Array))
|
56
|
+
|
57
|
+
remove_tracking_parameters!(params) if remove_tracking_parameters?
|
58
|
+
order_query_parameters!(params) if order_query_parameters?
|
59
|
+
|
60
|
+
uri.query_values = params.empty? ? nil : params
|
61
|
+
end
|
62
|
+
|
63
|
+
def remove_tracking_parameters!(params)
|
64
|
+
params.reject! { |key, val| val.to_s.empty? || tracking_params.include?(key) }
|
65
|
+
end
|
66
|
+
|
67
|
+
def order_query_parameters!(params)
|
68
|
+
params.sort_by!(&:first)
|
69
|
+
end
|
70
|
+
|
71
|
+
def normalize_host!(uri)
|
72
|
+
uri.host &&= uri.host.delete_prefix("www.")
|
73
|
+
end
|
74
|
+
|
75
|
+
def normalize_path!(uri)
|
76
|
+
uri.path = uri.path.delete_suffix(File::SEPARATOR) if uri.path && uri.path.length > 1
|
77
|
+
end
|
78
|
+
|
79
|
+
def remove_fragment!(uri)
|
80
|
+
uri.fragment = nil
|
81
|
+
end
|
82
|
+
|
83
|
+
def root_path!(uri)
|
84
|
+
uri.path = "" if uri.path == File::SEPARATOR
|
85
|
+
end
|
86
|
+
|
87
|
+
def normalization_config
|
88
|
+
Wayfarer.config.fetch(:normalization)
|
89
|
+
end
|
90
|
+
|
91
|
+
def supported_protocols
|
92
|
+
normalization_config.fetch(:schemes)
|
93
|
+
end
|
94
|
+
|
95
|
+
def tracking_params
|
96
|
+
normalization_config.fetch(:tracking_params)
|
97
|
+
end
|
98
|
+
|
99
|
+
def remove_www?
|
100
|
+
normalization_config.fetch(:remove_www)
|
101
|
+
end
|
102
|
+
|
103
|
+
def remove_trailing_slash?
|
104
|
+
normalization_config.fetch(:remove_trailing_slash)
|
105
|
+
end
|
106
|
+
|
107
|
+
def remove_fragment?
|
108
|
+
normalization_config.fetch(:remove_fragment)
|
109
|
+
end
|
110
|
+
|
111
|
+
def remove_tracking_parameters?
|
112
|
+
normalization_config.fetch(:remove_tracking_parameters)
|
113
|
+
end
|
114
|
+
|
115
|
+
def order_query_parameters?
|
116
|
+
normalization_config.fetch(:order_query_parameters)
|
117
|
+
end
|
118
|
+
end
|
119
|
+
end
|
120
|
+
end
|
data/lib/wayfarer.rb
CHANGED
@@ -3,13 +3,16 @@
|
|
3
3
|
require "cgi"
|
4
4
|
require "forwardable"
|
5
5
|
require "net/http"
|
6
|
+
require "pp"
|
6
7
|
require "securerandom"
|
7
8
|
require "uri"
|
9
|
+
require "yaml"
|
8
10
|
|
9
11
|
require "active_job"
|
12
|
+
require "active_support/core_ext/array/wrap"
|
13
|
+
require "active_support/core_ext/object/deep_dup"
|
10
14
|
require "capybara"
|
11
15
|
require "connection_pool"
|
12
|
-
require "docile"
|
13
16
|
require "ferrum"
|
14
17
|
require "metainspector"
|
15
18
|
require "mime/types"
|
@@ -18,7 +21,6 @@ require "mock_redis"
|
|
18
21
|
require "mustermann"
|
19
22
|
require "net/http/persistent"
|
20
23
|
require "nokogiri"
|
21
|
-
require "normalize_url"
|
22
24
|
require "selenium-webdriver"
|
23
25
|
require "redis"
|
24
26
|
require "thor"
|
@@ -28,32 +30,71 @@ loader = Zeitwerk::Loader.for_gem
|
|
28
30
|
loader.inflector.inflect("cli" => "CLI",
|
29
31
|
"dsl" => "DSL",
|
30
32
|
"http" => "HTTP",
|
33
|
+
"uri" => "URI",
|
31
34
|
"url" => "URL",
|
32
35
|
"xml" => "XML",
|
33
36
|
"json" => "JSON",
|
34
|
-
"gc" => "GC"
|
37
|
+
"gc" => "GC",
|
38
|
+
"kv" => "KV")
|
35
39
|
loader.setup
|
36
40
|
|
37
41
|
module Wayfarer
|
38
42
|
module VERSION
|
39
43
|
MAJOR = 0
|
40
44
|
MINOR = 4
|
41
|
-
TINY =
|
45
|
+
TINY = 8
|
42
46
|
STRING = [MAJOR, MINOR, TINY].join(".")
|
43
47
|
end
|
44
48
|
|
45
49
|
DEFAULT_CONFIG = {
|
46
50
|
redis: {
|
47
51
|
url: "redis://localhost:6379/0",
|
48
|
-
factory: ->(
|
52
|
+
factory: ->(redis_config) { ::Redis.new(url: redis_config.fetch(:url)) }
|
49
53
|
},
|
50
54
|
network: {
|
51
55
|
agent: :http,
|
52
|
-
|
53
|
-
|
56
|
+
pool: {
|
57
|
+
size: 1,
|
58
|
+
timeout: 10
|
59
|
+
},
|
54
60
|
http_headers: {},
|
55
61
|
renew_on: []
|
56
62
|
},
|
63
|
+
parsing: {
|
64
|
+
registry: {
|
65
|
+
"application/json" => Wayfarer::Parsing::JSON,
|
66
|
+
"text/html" => [Wayfarer::Parsing::XML, :html],
|
67
|
+
"application/xml" => [Wayfarer::Parsing::XML, :xml]
|
68
|
+
}
|
69
|
+
},
|
70
|
+
normalization: {
|
71
|
+
remove_www: true,
|
72
|
+
remove_trailing_slash: true,
|
73
|
+
remove_fragment: true,
|
74
|
+
remove_tracking_parameters: true,
|
75
|
+
order_query_parameters: true,
|
76
|
+
schemes: %w[
|
77
|
+
http
|
78
|
+
https
|
79
|
+
].to_set,
|
80
|
+
tracking_params: %w[
|
81
|
+
utm_source
|
82
|
+
utm_medium
|
83
|
+
utm_term
|
84
|
+
utm_content
|
85
|
+
utm_campaign
|
86
|
+
gclid
|
87
|
+
fbclid
|
88
|
+
msclkid
|
89
|
+
sms_ss
|
90
|
+
awesm
|
91
|
+
xtor
|
92
|
+
PHPSESSID
|
93
|
+
].to_set
|
94
|
+
},
|
95
|
+
deduplication: {
|
96
|
+
key: ->(task) { task[:uri].to_s }
|
97
|
+
},
|
57
98
|
capybara: {
|
58
99
|
driver: nil
|
59
100
|
},
|
@@ -62,12 +103,11 @@ module Wayfarer
|
|
62
103
|
},
|
63
104
|
selenium: {
|
64
105
|
driver: :chrome,
|
65
|
-
options: {}
|
66
|
-
client_timeout: 60
|
106
|
+
options: {}
|
67
107
|
}
|
68
108
|
}.freeze
|
69
109
|
|
70
|
-
mattr_accessor :config, default: DEFAULT_CONFIG.
|
110
|
+
mattr_accessor :config, default: DEFAULT_CONFIG.deep_dup
|
71
111
|
|
72
112
|
UserAgentTimeoutError = Class.new(StandardError) # TODO: Move to Networking namespace
|
73
113
|
end
|
data/mise.toml
ADDED
data/mkdocs.yml
CHANGED
@@ -7,6 +7,7 @@ markdown_extensions:
|
|
7
7
|
- attr_list
|
8
8
|
- meta
|
9
9
|
- def_list
|
10
|
+
- pymdownx.snippets
|
10
11
|
- pymdownx.details
|
11
12
|
- pymdownx.highlight
|
12
13
|
- pymdownx.inlinehilite
|
@@ -19,7 +20,8 @@ markdown_extensions:
|
|
19
20
|
- pymdownx.caret
|
20
21
|
- pymdownx.mark
|
21
22
|
- pymdownx.tilde
|
22
|
-
- pymdownx.tabbed
|
23
|
+
- pymdownx.tabbed:
|
24
|
+
alternate_style: true
|
23
25
|
- pymdownx.tasklist:
|
24
26
|
custom_checkbox: true
|
25
27
|
|
@@ -56,21 +58,14 @@ nav:
|
|
56
58
|
- Home: index.md
|
57
59
|
- Guides:
|
58
60
|
- Tutorial: guides/tutorial.md
|
59
|
-
- Jobs:
|
60
|
-
- Overview: guides/jobs.md
|
61
|
-
- Error handling: guides/jobs/error_handling.md
|
61
|
+
- Jobs: guides/jobs.md
|
62
62
|
- Tasks: guides/tasks.md
|
63
63
|
- Pages: guides/pages.md
|
64
|
-
- Routing:
|
65
|
-
- Overview: guides/routing.md
|
66
|
-
- Matchers:
|
67
|
-
- URL: todo
|
68
|
-
- Host: todo
|
69
|
-
- Path: todo
|
70
|
-
- Query: todo
|
71
|
-
- Custom matchers: todo
|
64
|
+
- Routing: guides/routing.md
|
72
65
|
- Callbacks: guides/callbacks.md
|
73
66
|
- Handlers: guides/handlers.md
|
67
|
+
- Configuration: guides/configuration.md
|
68
|
+
- Command-line interface: guides/cli.md
|
74
69
|
- Networking:
|
75
70
|
- Introduction: guides/user_agents.md
|
76
71
|
- User agent API: guides/networking/custom_adapters.md
|
@@ -80,13 +75,9 @@ nav:
|
|
80
75
|
- Selenium: guides/networking/selenium.md
|
81
76
|
- Capybara: guides/networking/capybara.md
|
82
77
|
- Redis: guides/redis.md
|
83
|
-
-
|
84
|
-
- Reference:
|
85
|
-
- Configuration: reference/configuration.md
|
86
|
-
- Command-line interface: reference/cli.md
|
78
|
+
- Development: guides/development.md
|
87
79
|
- Cookbook:
|
88
80
|
- Browser navigation: cookbook/navigation.md
|
89
81
|
- Executing JavaScript: cookbook/executing_javascript.md
|
90
82
|
- Screenhots: cookbook/screenshots.md
|
91
|
-
- Setting the User-Agent: cookbook/user_agent.md
|
92
83
|
- API documentation: "https://www.rubydoc.info"
|
data/rake/lint.rake
CHANGED
@@ -1,105 +1,9 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
|
-
require "open-uri"
|
4
|
-
require "fileutils"
|
5
|
-
require "rubygems/package"
|
6
|
-
require "zlib"
|
7
|
-
require "zip"
|
8
3
|
require "rubocop/rake_task"
|
9
|
-
require "net/http"
|
10
|
-
require "uri"
|
11
|
-
|
12
|
-
VALE_URL_ROOT = "https://github.com/errata-ai"
|
13
|
-
VALE_VERSION = "2.15.4"
|
14
|
-
VALE_STYLE_GUIDE_VERSION = "v0.3.3"
|
15
|
-
VALE_PREFIX = File.expand_path(File.join(".github", "bin"))
|
16
|
-
VALE_STYLE_GUIDE_PREFIX = File.expand_path(File.join(".github", "share"))
|
17
4
|
|
18
5
|
namespace :lint do
|
19
6
|
RuboCop::RakeTask.new do |task|
|
20
7
|
task.formatters = %w[simple]
|
21
8
|
end
|
22
|
-
|
23
|
-
vale_command = File.join(VALE_PREFIX, "vale")
|
24
|
-
style_guide_directory = File.join(VALE_STYLE_GUIDE_PREFIX, "vale")
|
25
|
-
|
26
|
-
desc "Lint documentation"
|
27
|
-
task vale: [:"lint:vale:clean", vale_command, style_guide_directory] do
|
28
|
-
sh "#{vale_command} docs/**/*.md"
|
29
|
-
end
|
30
|
-
|
31
|
-
namespace :vale do
|
32
|
-
directory VALE_PREFIX
|
33
|
-
|
34
|
-
desc "Install Vale to #{vale_command}"
|
35
|
-
file vale_command => VALE_PREFIX do
|
36
|
-
unless (local_command = `which vale`.strip).empty?
|
37
|
-
next File.symlink(local_command, vale_command)
|
38
|
-
end
|
39
|
-
|
40
|
-
filename = "vale_#{VALE_VERSION}_Linux_64-bit.tar.gz"
|
41
|
-
url = File.join(VALE_URL_ROOT, "/vale/releases/download/v#{VALE_VERSION}/#{filename}")
|
42
|
-
|
43
|
-
extract_tar_gz(url, vale_command)
|
44
|
-
|
45
|
-
FileUtils.chmod("+x", vale_command)
|
46
|
-
end
|
47
|
-
|
48
|
-
desc "Deletes Vale"
|
49
|
-
task clean: :"lint:vale:style_guide:clean" do
|
50
|
-
File.delete(vale_command) if File.exist?(vale_command)
|
51
|
-
end
|
52
|
-
|
53
|
-
namespace :style_guide do
|
54
|
-
directory VALE_STYLE_GUIDE_PREFIX
|
55
|
-
|
56
|
-
desc "Retrieve Vale Google style guide #{VALE_STYLE_GUIDE_VERSION}"
|
57
|
-
directory style_guide_directory => VALE_STYLE_GUIDE_PREFIX do
|
58
|
-
FileUtils.mkdir_p(style_guide_directory)
|
59
|
-
url = "https://github.com/errata-ai/Google/releases/download/#{VALE_STYLE_GUIDE_VERSION}/Google.zip"
|
60
|
-
extract_zip(url, style_guide_directory)
|
61
|
-
end
|
62
|
-
|
63
|
-
desc "Deletes the Vale Google style guide"
|
64
|
-
task :clean do
|
65
|
-
FileUtils.rm_rf([style_guide_directory])
|
66
|
-
end
|
67
|
-
|
68
|
-
private
|
69
|
-
|
70
|
-
def extract_zip(url, destination)
|
71
|
-
content = URI.open(url)
|
72
|
-
|
73
|
-
Zip::File.open_buffer(content) do |zip|
|
74
|
-
zip.each do |entry|
|
75
|
-
path = File.join(destination, entry.name)
|
76
|
-
|
77
|
-
if entry.directory?
|
78
|
-
FileUtils.mkdir_p(path)
|
79
|
-
else
|
80
|
-
entry.extract(path)
|
81
|
-
File.chmod(0o755, path)
|
82
|
-
end
|
83
|
-
end
|
84
|
-
end
|
85
|
-
end
|
86
|
-
end
|
87
|
-
|
88
|
-
private
|
89
|
-
|
90
|
-
def extract_tar_gz(url, destination)
|
91
|
-
URI.open(url) do |file|
|
92
|
-
Zlib::GzipReader.open(file) do |gz|
|
93
|
-
Gem::Package::TarReader.new(gz) do |tar|
|
94
|
-
tar.each do |entry|
|
95
|
-
next unless entry.file?
|
96
|
-
|
97
|
-
binary = entry.read
|
98
|
-
File.write(destination, binary)
|
99
|
-
end
|
100
|
-
end
|
101
|
-
end
|
102
|
-
end
|
103
|
-
end
|
104
|
-
end
|
105
9
|
end
|
data/rake/release.rake
CHANGED
@@ -13,17 +13,11 @@ namespace :release do
|
|
13
13
|
raise "Gem version #{gem_version} deviates from library version #{lib_version}" unless gem_version == lib_version
|
14
14
|
end
|
15
15
|
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
key = ENV.fetch("GEM_HOST_API_KEY") { raise "`GEM_HOST_API_KEY` is unset" }
|
21
|
-
contents = YAML.dump(rubygems_api_key: "Basic #{key}")
|
22
|
-
gem_path = File.join(Dir.home, ".gem")
|
23
|
-
FileUtils.mkdir_p(gem_path)
|
24
|
-
File.write(File.join(gem_path, "credentials"), contents)
|
16
|
+
task :guard_debug do
|
17
|
+
sh "git diff --exit-code"
|
18
|
+
sh "git diff-index --quiet --cached HEAD"
|
25
19
|
end
|
26
20
|
end
|
27
21
|
|
28
|
-
Rake::Task[:release].enhance(%i[release:guard_versions])
|
29
|
-
Rake::Task[:"release:
|
22
|
+
Rake::Task[:"release:rubygem_push"].enhance(%i[release:guard_versions])
|
23
|
+
Rake::Task[:"release:guard_clean"].enhance(%i[release:guard_debug])
|