wayfarer 0.4.6 → 0.4.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.env +17 -0
- data/.github/workflows/lint.yaml +27 -0
- data/.github/workflows/release.yaml +30 -0
- data/.github/workflows/tests.yaml +21 -0
- data/.gitignore +5 -1
- data/.rubocop.yml +36 -0
- data/.vale.ini +8 -0
- data/.yardopts +1 -3
- data/Dockerfile +6 -4
- data/Gemfile +24 -0
- data/Gemfile.lock +274 -164
- data/Rakefile +7 -51
- data/bin/wayfarer +1 -1
- data/docker-compose.yml +23 -13
- data/docs/cookbook/consent_screen.md +2 -2
- data/docs/cookbook/executing_javascript.md +3 -3
- data/docs/cookbook/navigation.md +12 -12
- data/docs/cookbook/querying_html.md +3 -3
- data/docs/cookbook/screenshots.md +2 -2
- data/docs/guides/callbacks.md +25 -125
- data/docs/guides/cli.md +71 -0
- data/docs/guides/configuration.md +10 -35
- data/docs/guides/development.md +67 -0
- data/docs/guides/handlers.md +60 -0
- data/docs/guides/index.md +1 -0
- data/docs/guides/jobs.md +142 -31
- data/docs/guides/navigation.md +1 -1
- data/docs/guides/networking/capybara.md +13 -22
- data/docs/guides/networking/custom_adapters.md +103 -41
- data/docs/guides/networking/ferrum.md +4 -4
- data/docs/guides/networking/http.md +9 -13
- data/docs/guides/networking/selenium.md +10 -11
- data/docs/guides/pages.md +78 -10
- data/docs/guides/redis.md +10 -0
- data/docs/guides/routing.md +156 -0
- data/docs/guides/tasks.md +53 -9
- data/docs/guides/tutorial.md +66 -0
- data/docs/guides/user_agents.md +115 -0
- data/docs/index.md +17 -40
- data/lib/wayfarer/base.rb +125 -46
- data/lib/wayfarer/batch_completion.rb +60 -0
- data/lib/wayfarer/callbacks.rb +22 -48
- data/lib/wayfarer/cli/route_printer.rb +85 -89
- data/lib/wayfarer/cli.rb +103 -0
- data/lib/wayfarer/gc.rb +18 -6
- data/lib/wayfarer/handler.rb +15 -7
- data/lib/wayfarer/kv.rb +28 -0
- data/lib/wayfarer/logging.rb +38 -0
- data/lib/wayfarer/middleware/base.rb +2 -0
- data/lib/wayfarer/middleware/batch_completion.rb +19 -0
- data/lib/wayfarer/middleware/chain.rb +7 -1
- data/lib/wayfarer/middleware/content_type.rb +59 -0
- data/lib/wayfarer/middleware/controller.rb +19 -15
- data/lib/wayfarer/middleware/dedup.rb +22 -13
- data/lib/wayfarer/middleware/dispatch.rb +17 -4
- data/lib/wayfarer/middleware/normalize.rb +7 -14
- data/lib/wayfarer/middleware/redis.rb +15 -0
- data/lib/wayfarer/middleware/router.rb +33 -35
- data/lib/wayfarer/middleware/stage.rb +5 -5
- data/lib/wayfarer/middleware/uri_parser.rb +31 -0
- data/lib/wayfarer/middleware/user_agent.rb +49 -0
- data/lib/wayfarer/networking/capybara.rb +1 -1
- data/lib/wayfarer/networking/context.rb +14 -3
- data/lib/wayfarer/networking/ferrum.rb +1 -4
- data/lib/wayfarer/networking/follow.rb +14 -7
- data/lib/wayfarer/networking/http.rb +1 -1
- data/lib/wayfarer/networking/pool.rb +23 -13
- data/lib/wayfarer/networking/selenium.rb +15 -7
- data/lib/wayfarer/networking/strategy.rb +2 -2
- data/lib/wayfarer/page.rb +34 -14
- data/lib/wayfarer/parsing/xml.rb +6 -6
- data/lib/wayfarer/parsing.rb +21 -0
- data/lib/wayfarer/redis/barrier.rb +26 -21
- data/lib/wayfarer/redis/counter.rb +18 -9
- data/lib/wayfarer/redis/pool.rb +1 -1
- data/lib/wayfarer/redis/resettable.rb +19 -0
- data/lib/wayfarer/routing/dsl.rb +166 -30
- data/lib/wayfarer/routing/hash_stack.rb +33 -0
- data/lib/wayfarer/routing/matchers/custom.rb +8 -5
- data/lib/wayfarer/routing/matchers/{suffix.rb → empty_params.rb} +2 -6
- data/lib/wayfarer/routing/matchers/host.rb +15 -9
- data/lib/wayfarer/routing/matchers/path.rb +11 -31
- data/lib/wayfarer/routing/matchers/query.rb +41 -17
- data/lib/wayfarer/routing/matchers/result.rb +12 -0
- data/lib/wayfarer/routing/matchers/scheme.rb +13 -5
- data/lib/wayfarer/routing/matchers/url.rb +13 -5
- data/lib/wayfarer/routing/path_consumer.rb +130 -0
- data/lib/wayfarer/routing/path_finder.rb +151 -23
- data/lib/wayfarer/routing/result.rb +1 -1
- data/lib/wayfarer/routing/root_route.rb +17 -1
- data/lib/wayfarer/routing/route.rb +66 -19
- data/lib/wayfarer/routing/serializable.rb +28 -0
- data/lib/wayfarer/routing/sub_route.rb +53 -0
- data/lib/wayfarer/routing/target_route.rb +17 -1
- data/lib/wayfarer/stringify.rb +21 -30
- data/lib/wayfarer/task.rb +9 -17
- data/lib/wayfarer/uri/normalization.rb +120 -0
- data/lib/wayfarer.rb +72 -5
- data/mise.toml +2 -0
- data/mkdocs.yml +44 -8
- data/rake/docs.rake +26 -0
- data/rake/lint.rake +9 -0
- data/rake/release.rake +23 -0
- data/rake/tests.rake +32 -0
- data/requirements.txt +1 -1
- data/spec/factories/job.rb +8 -0
- data/spec/factories/middleware.rb +2 -2
- data/spec/factories/path_finder.rb +11 -0
- data/spec/factories/redis.rb +19 -0
- data/spec/factories/task.rb +46 -2
- data/spec/spec_helpers.rb +55 -51
- data/spec/support/active_job_helpers.rb +8 -0
- data/spec/support/integration_helpers.rb +21 -0
- data/spec/support/redis_helpers.rb +9 -0
- data/spec/support/test_app.rb +66 -37
- data/spec/wayfarer/base_spec.rb +200 -0
- data/spec/wayfarer/batch_completion_spec.rb +142 -0
- data/spec/wayfarer/cli/job_spec.rb +88 -0
- data/spec/wayfarer/cli/routing_spec.rb +322 -0
- data/spec/{cli → wayfarer/cli}/version_spec.rb +1 -1
- data/spec/wayfarer/gc_spec.rb +29 -0
- data/spec/wayfarer/handler_spec.rb +9 -0
- data/spec/wayfarer/integration/callbacks_spec.rb +200 -0
- data/spec/wayfarer/integration/content_type_spec.rb +37 -0
- data/spec/wayfarer/integration/custom_routing_spec.rb +51 -0
- data/spec/wayfarer/integration/gc_spec.rb +40 -0
- data/spec/wayfarer/integration/handler_spec.rb +65 -0
- data/spec/wayfarer/integration/page_spec.rb +79 -0
- data/spec/wayfarer/integration/params_spec.rb +64 -0
- data/spec/wayfarer/integration/parsing_spec.rb +99 -0
- data/spec/wayfarer/integration/retry_spec.rb +112 -0
- data/spec/wayfarer/integration/stage_spec.rb +58 -0
- data/spec/wayfarer/middleware/batch_completion_spec.rb +33 -0
- data/spec/{middleware → wayfarer/middleware}/chain_spec.rb +24 -19
- data/spec/wayfarer/middleware/content_type_spec.rb +83 -0
- data/spec/{middleware → wayfarer/middleware}/controller_spec.rb +24 -22
- data/spec/wayfarer/middleware/dedup_spec.rb +66 -0
- data/spec/wayfarer/middleware/normalize_spec.rb +32 -0
- data/spec/wayfarer/middleware/router_spec.rb +102 -0
- data/spec/wayfarer/middleware/stage_spec.rb +63 -0
- data/spec/wayfarer/middleware/uri_parser_spec.rb +63 -0
- data/spec/wayfarer/middleware/user_agent_spec.rb +158 -0
- data/spec/wayfarer/networking/capybara_spec.rb +13 -0
- data/spec/{networking → wayfarer/networking}/context_spec.rb +46 -38
- data/spec/wayfarer/networking/ferrum_spec.rb +13 -0
- data/spec/{networking → wayfarer/networking}/follow_spec.rb +11 -6
- data/spec/wayfarer/networking/http_spec.rb +12 -0
- data/spec/{networking → wayfarer/networking}/pool_spec.rb +16 -14
- data/spec/wayfarer/networking/selenium_spec.rb +12 -0
- data/spec/{networking → wayfarer/networking}/strategy.rb +33 -54
- data/spec/wayfarer/page_spec.rb +69 -0
- data/spec/{parsing → wayfarer/parsing}/json_spec.rb +1 -1
- data/spec/wayfarer/parsing/xml_parse_spec.rb +25 -0
- data/spec/wayfarer/redis/barrier_spec.rb +39 -0
- data/spec/wayfarer/redis/counter_spec.rb +34 -0
- data/spec/{redis → wayfarer/redis}/pool_spec.rb +4 -3
- data/spec/{routing → wayfarer/routing}/dsl_spec.rb +12 -22
- data/spec/wayfarer/routing/hash_stack_spec.rb +63 -0
- data/spec/wayfarer/routing/integration_spec.rb +101 -0
- data/spec/wayfarer/routing/matchers/custom_spec.rb +39 -0
- data/spec/wayfarer/routing/matchers/host_spec.rb +56 -0
- data/spec/wayfarer/routing/matchers/matcher.rb +17 -0
- data/spec/wayfarer/routing/matchers/path_spec.rb +43 -0
- data/spec/wayfarer/routing/matchers/query_spec.rb +123 -0
- data/spec/wayfarer/routing/matchers/scheme_spec.rb +45 -0
- data/spec/wayfarer/routing/matchers/url_spec.rb +33 -0
- data/spec/wayfarer/routing/path_consumer_spec.rb +123 -0
- data/spec/wayfarer/routing/path_finder_spec.rb +409 -0
- data/spec/wayfarer/routing/root_route_spec.rb +51 -0
- data/spec/wayfarer/routing/route_spec.rb +74 -0
- data/spec/wayfarer/routing/sub_route_spec.rb +103 -0
- data/spec/wayfarer/task_spec.rb +13 -0
- data/spec/wayfarer/uri/normalization_spec.rb +98 -0
- data/spec/wayfarer_spec.rb +2 -2
- data/wayfarer.gemspec +18 -28
- metadata +797 -265
- data/.github/workflows/ci.yaml +0 -32
- data/.rbenv-gemsets +0 -1
- data/.ruby-version +0 -1
- data/RELEASING.md +0 -17
- data/docs/cookbook/user_agent.md +0 -7
- data/docs/guides/error_handling.md +0 -53
- data/docs/guides/networking.md +0 -94
- data/docs/guides/performance.md +0 -130
- data/docs/guides/reliability.md +0 -41
- data/docs/guides/routing/steering.md +0 -30
- data/docs/reference/api/base.md +0 -48
- data/docs/reference/cli.md +0 -61
- data/docs/reference/configuration_keys.md +0 -43
- data/docs/reference/environment_variables.md +0 -83
- data/lib/wayfarer/cli/base.rb +0 -45
- data/lib/wayfarer/cli/generate.rb +0 -17
- data/lib/wayfarer/cli/job.rb +0 -56
- data/lib/wayfarer/cli/route.rb +0 -29
- data/lib/wayfarer/cli/runner.rb +0 -34
- data/lib/wayfarer/cli/templates/Gemfile.tt +0 -5
- data/lib/wayfarer/cli/templates/job.rb.tt +0 -10
- data/lib/wayfarer/config/capybara.rb +0 -10
- data/lib/wayfarer/config/ferrum.rb +0 -11
- data/lib/wayfarer/config/networking.rb +0 -29
- data/lib/wayfarer/config/redis.rb +0 -14
- data/lib/wayfarer/config/root.rb +0 -11
- data/lib/wayfarer/config/selenium.rb +0 -21
- data/lib/wayfarer/config/strconv.rb +0 -45
- data/lib/wayfarer/config/struct.rb +0 -72
- data/lib/wayfarer/middleware/fetch.rb +0 -56
- data/lib/wayfarer/redis/connection.rb +0 -13
- data/lib/wayfarer/redis/version.rb +0 -19
- data/lib/wayfarer/routing/router.rb +0 -28
- data/spec/base_spec.rb +0 -224
- data/spec/callbacks_spec.rb +0 -102
- data/spec/cli/generate_spec.rb +0 -39
- data/spec/cli/job_spec.rb +0 -78
- data/spec/config/capybara_spec.rb +0 -18
- data/spec/config/ferrum_spec.rb +0 -24
- data/spec/config/networking_spec.rb +0 -73
- data/spec/config/redis_spec.rb +0 -32
- data/spec/config/root_spec.rb +0 -31
- data/spec/config/selenium_spec.rb +0 -56
- data/spec/config/strconv_spec.rb +0 -58
- data/spec/config/struct_spec.rb +0 -66
- data/spec/fixtures/dummy_job.rb +0 -7
- data/spec/gc_spec.rb +0 -59
- data/spec/handler_spec.rb +0 -11
- data/spec/integration/callbacks_spec.rb +0 -85
- data/spec/integration/page_spec.rb +0 -62
- data/spec/integration/params_spec.rb +0 -56
- data/spec/integration/stage_spec.rb +0 -51
- data/spec/integration/steering_spec.rb +0 -57
- data/spec/middleware/dedup_spec.rb +0 -88
- data/spec/middleware/dispatch_spec.rb +0 -43
- data/spec/middleware/fetch_spec.rb +0 -155
- data/spec/middleware/normalize_spec.rb +0 -29
- data/spec/middleware/router_spec.rb +0 -105
- data/spec/middleware/stage_spec.rb +0 -62
- data/spec/networking/capybara_spec.rb +0 -12
- data/spec/networking/ferrum_spec.rb +0 -12
- data/spec/networking/http_spec.rb +0 -12
- data/spec/networking/selenium_spec.rb +0 -12
- data/spec/page_spec.rb +0 -47
- data/spec/parsing/xml_spec.rb +0 -25
- data/spec/redis/barrier_spec.rb +0 -78
- data/spec/redis/counter_spec.rb +0 -32
- data/spec/redis/version_spec.rb +0 -13
- data/spec/routing/integration_spec.rb +0 -110
- data/spec/routing/matchers/custom_spec.rb +0 -31
- data/spec/routing/matchers/host_spec.rb +0 -49
- data/spec/routing/matchers/path_spec.rb +0 -43
- data/spec/routing/matchers/query_spec.rb +0 -137
- data/spec/routing/matchers/scheme_spec.rb +0 -25
- data/spec/routing/matchers/suffix_spec.rb +0 -41
- data/spec/routing/matchers/uri_spec.rb +0 -27
- data/spec/routing/path_finder_spec.rb +0 -33
- data/spec/routing/root_route_spec.rb +0 -29
- data/spec/routing/route_spec.rb +0 -43
- data/spec/routing/router_spec.rb +0 -24
- data/spec/task_spec.rb +0 -34
- data/spec/{stringify_spec.rb → wayfarer/stringify_spec.rb} +2 -2
@@ -9,7 +9,7 @@ module Wayfarer
|
|
9
9
|
|
10
10
|
def create
|
11
11
|
Net::HTTP::Persistent.new(name: CONNECTION_NAME).tap do |conn|
|
12
|
-
Wayfarer.config
|
12
|
+
Wayfarer.config[:network][:http_headers].each do |key, val|
|
13
13
|
conn.override_headers[key] = val
|
14
14
|
end
|
15
15
|
end
|
@@ -5,25 +5,35 @@ module Wayfarer
|
|
5
5
|
class Pool
|
6
6
|
include Singleton
|
7
7
|
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
8
|
+
class_attribute :finalizer,
|
9
|
+
default: ->(pool) { pool.shutdown(&:renew) }.freeze,
|
10
|
+
instance_accessor: false,
|
11
|
+
instance_predicate: false
|
12
|
+
|
13
|
+
class_attribute :registry,
|
14
|
+
default: { http: HTTP,
|
15
|
+
ferrum: Ferrum,
|
16
|
+
selenium: Selenium,
|
17
|
+
capybara: Capybara },
|
18
|
+
instance_accessor: false,
|
19
|
+
instance_predicate: false
|
20
|
+
|
21
|
+
attr_reader :pool
|
22
|
+
|
23
|
+
def initialize
|
24
|
+
@pool = ConnectionPool.new(**Wayfarer.config.dig(:network, :pool), &method(:context))
|
25
|
+
|
26
|
+
at_exit { free }
|
17
27
|
end
|
18
28
|
|
19
|
-
def with(&
|
20
|
-
pool.with(&
|
29
|
+
def with(&)
|
30
|
+
@pool.with(&)
|
21
31
|
rescue ConnectionPool::TimeoutError => e
|
22
32
|
raise Wayfarer::UserAgentTimeoutError, e
|
23
33
|
end
|
24
34
|
|
25
35
|
def free
|
26
|
-
|
36
|
+
self.class.finalizer.call(@pool)
|
27
37
|
end
|
28
38
|
|
29
39
|
private
|
@@ -33,7 +43,7 @@ module Wayfarer
|
|
33
43
|
end
|
34
44
|
|
35
45
|
def strategy
|
36
|
-
self.class.registry[Wayfarer.config.network
|
46
|
+
self.class.registry[Wayfarer.config.dig(:network, :agent)].new
|
37
47
|
end
|
38
48
|
end
|
39
49
|
end
|
@@ -9,7 +9,7 @@ module Wayfarer
|
|
9
9
|
MOCK_RESPONSE_HEADERS = {}.freeze
|
10
10
|
|
11
11
|
def create
|
12
|
-
::Selenium::WebDriver.for(
|
12
|
+
::Selenium::WebDriver.for(driver, options)
|
13
13
|
end
|
14
14
|
|
15
15
|
def destroy(instance)
|
@@ -29,15 +29,23 @@ module Wayfarer
|
|
29
29
|
|
30
30
|
private
|
31
31
|
|
32
|
-
def
|
33
|
-
Wayfarer.config.
|
32
|
+
def driver
|
33
|
+
Wayfarer.config.dig(:selenium, :driver)
|
34
34
|
end
|
35
35
|
|
36
|
-
def
|
37
|
-
|
38
|
-
client.read_timeout = Wayfarer.config.selenium.client_timeout
|
39
|
-
end
|
36
|
+
def options
|
37
|
+
Wayfarer.config.dig(:selenium, :options)
|
40
38
|
end
|
39
|
+
|
40
|
+
# def options
|
41
|
+
# Wayfarer.config[:selenium][:options].merge(http_client: http_client)
|
42
|
+
# end
|
43
|
+
|
44
|
+
# def http_client
|
45
|
+
# ::Selenium::WebDriver::Remote::Http::Default.new.tap do |client|
|
46
|
+
# client.read_timeout = Wayfarer.config[:selenium][:client_timeout]
|
47
|
+
# end
|
48
|
+
# end
|
41
49
|
end
|
42
50
|
end
|
43
51
|
end
|
@@ -13,13 +13,13 @@ module Wayfarer
|
|
13
13
|
end
|
14
14
|
|
15
15
|
def navigate(_instance, _url)
|
16
|
-
raise
|
16
|
+
raise NotImplementedError
|
17
17
|
end
|
18
18
|
|
19
19
|
def live(_instance); end
|
20
20
|
|
21
21
|
def create
|
22
|
-
raise
|
22
|
+
raise NotImplementedError
|
23
23
|
end
|
24
24
|
|
25
25
|
def destroy(_instance); end
|
data/lib/wayfarer/page.rb
CHANGED
@@ -1,12 +1,22 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
3
|
module Wayfarer
|
4
|
+
# @!attribute [r] url
|
5
|
+
# @return [String] the URL that was fetched
|
6
|
+
# @!attribute [r] status_code
|
7
|
+
# @return [Fixnum] HTTP status code
|
8
|
+
# @!attribute [r] body
|
9
|
+
# @return [String] the body of the response
|
10
|
+
# @!attribute [r] headers
|
11
|
+
# @return [Hash] the headers of the response
|
12
|
+
# @note HTTP header keys are downcased, for example: `content-type`.
|
4
13
|
class Page
|
5
14
|
attr_reader :url,
|
6
15
|
:status_code,
|
7
16
|
:body,
|
8
17
|
:headers
|
9
18
|
|
19
|
+
# @!visibility private
|
10
20
|
def initialize(url:, status_code:, body:, headers:)
|
11
21
|
@url = url
|
12
22
|
@status_code = status_code
|
@@ -14,24 +24,34 @@ module Wayfarer
|
|
14
24
|
@headers = headers.transform_keys(&:downcase)
|
15
25
|
end
|
16
26
|
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
content_type = headers["content-type"]
|
24
|
-
sub_type = MIME::Types[content_type].first.sub_type
|
27
|
+
# Returns the MIME type of the response.
|
28
|
+
# @return [MIME::Type]
|
29
|
+
# @see https://www.rubydoc.info/gems/mime-types/MIME/Type
|
30
|
+
def mime_type
|
31
|
+
@mime_type ||= MIME::Types[content_type]&.first
|
32
|
+
end
|
25
33
|
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
34
|
+
# Returns a parsed representation of the HTTP response or the browser DOM,
|
35
|
+
# depending on the Content-Type.
|
36
|
+
# @return [Nokogiri::HTML::Document] when Content-Type is `text/html`
|
37
|
+
# @see https://www.rubydoc.info/github/sparklemotion/nokogiri/Nokogiri/HTML/Document Nokogiri::HTML::Document
|
38
|
+
# @return [Nokogiri::XML::Document] when Content-Type is `text/xml`
|
39
|
+
# @see https://www.rubydoc.info/github/sparklemotion/nokogiri/Nokogiri/XML/Document Nokogiri::XML::Document
|
40
|
+
# @return [Hash] when Content-Type is `application/json`
|
41
|
+
# @note You can register custom parsers with {Wayfarer::Parsing.registry}.
|
42
|
+
def doc
|
43
|
+
@doc ||= Wayfarer::Parsing.parse(body, mime_type&.content_type || content_type)
|
31
44
|
end
|
32
45
|
|
46
|
+
# Returns a `MetaInspector::Document`.
|
47
|
+
# @return [MetaInspector::Document]
|
48
|
+
# @see https://www.rubydoc.info/gems/metainspector/MetaInspector/Document
|
33
49
|
def meta
|
34
|
-
@meta ||= MetaInspector.new(url, document: body)
|
50
|
+
@meta ||= MetaInspector.new(url, document: body, headers: headers, normalize_url: false)
|
51
|
+
end
|
52
|
+
|
53
|
+
def content_type
|
54
|
+
@content_type ||= headers["content-type"]
|
35
55
|
end
|
36
56
|
end
|
37
57
|
end
|
data/lib/wayfarer/parsing/xml.rb
CHANGED
@@ -5,12 +5,12 @@ module Wayfarer
|
|
5
5
|
module XML
|
6
6
|
module_function
|
7
7
|
|
8
|
-
def
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
8
|
+
def parse(xml, variant)
|
9
|
+
case variant
|
10
|
+
when :xml then Nokogiri::XML(xml)
|
11
|
+
when :html then Nokogiri::HTML(xml)
|
12
|
+
else raise ArgumentError, "Unknown type: #{variant}"
|
13
|
+
end
|
14
14
|
end
|
15
15
|
end
|
16
16
|
end
|
@@ -0,0 +1,21 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Wayfarer
|
4
|
+
# @!scope class
|
5
|
+
# @!attribute [r] registry
|
6
|
+
# @return [Hash] Mapping of Content-Type to parser.
|
7
|
+
module Parsing
|
8
|
+
# @!visibility private
|
9
|
+
FALLBACK_CONTENT_TYPE = "application/octet-stream"
|
10
|
+
|
11
|
+
module_function
|
12
|
+
|
13
|
+
# @!visibility private
|
14
|
+
def parse(body, content_type = FALLBACK_CONTENT_TYPE)
|
15
|
+
parser, args = Wayfarer.config.dig(:parsing, :registry, content_type)
|
16
|
+
return unless parser
|
17
|
+
|
18
|
+
parser.parse(body, *args)
|
19
|
+
end
|
20
|
+
end
|
21
|
+
end
|
@@ -2,35 +2,40 @@
|
|
2
2
|
|
3
3
|
module Wayfarer
|
4
4
|
module Redis
|
5
|
-
Barrier
|
6
|
-
|
5
|
+
# A Barrier prevents processing the same key more than once.
|
6
|
+
# It marks keys in a Redis hash as they are encountered.
|
7
|
+
# Once a key is added, it cannot get removed.
|
8
|
+
class Barrier
|
9
|
+
include Resettable
|
7
10
|
|
8
|
-
|
9
|
-
|
10
|
-
end
|
11
|
+
# @return [Hash] the task configuration, including redis_pool and batch name
|
12
|
+
attr_reader :task
|
11
13
|
|
12
|
-
|
13
|
-
redis { |conn| conn.del(redis_key) }
|
14
|
-
end
|
14
|
+
VALUE = ""
|
15
15
|
|
16
|
-
|
17
|
-
|
16
|
+
# @param task [Wayfarer::Task] task context
|
17
|
+
def initialize(task)
|
18
|
+
@task = task
|
19
|
+
@redis_pool = task[:redis_pool]
|
18
20
|
end
|
19
21
|
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
# SMISMEMBER is only supported on Redis >= 6.2.0
|
24
|
-
if major > 6 || (major == 6 && minor >= 2)
|
25
|
-
redis { |conn| conn.smismember(redis_key, urls) }.map { |val| val == 1 }
|
26
|
-
else
|
27
|
-
urls.map { |url| redis { |conn| conn.sismember(redis_key, url) } }
|
28
|
-
end
|
22
|
+
# @return [String] the Redis key for this barrier
|
23
|
+
def redis_key
|
24
|
+
"wayfarer-barrier-#{task.batch}"
|
29
25
|
end
|
30
26
|
|
31
|
-
|
32
|
-
|
27
|
+
# Checks if a key has already been passed through the barrier.
|
28
|
+
#
|
29
|
+
# @param key [String] the key to check
|
30
|
+
# @return [Boolean] true if the key has already been seen, false otherwise
|
31
|
+
def check!(key)
|
32
|
+
!redis_pool.with { |conn| conn.hsetnx(redis_key, key, VALUE) }
|
33
33
|
end
|
34
|
+
|
35
|
+
private
|
36
|
+
|
37
|
+
# @return [ConnectionPool] the Redis connection pool
|
38
|
+
attr_reader :redis_pool
|
34
39
|
end
|
35
40
|
end
|
36
41
|
end
|
@@ -2,28 +2,37 @@
|
|
2
2
|
|
3
3
|
module Wayfarer
|
4
4
|
module Redis
|
5
|
-
Counter
|
6
|
-
include
|
5
|
+
class Counter
|
6
|
+
include Resettable
|
7
7
|
|
8
|
-
|
9
|
-
|
8
|
+
attr_reader :task
|
9
|
+
|
10
|
+
def initialize(task)
|
11
|
+
@task = task
|
12
|
+
@redis_pool = task[:redis_pool]
|
10
13
|
end
|
11
14
|
|
12
|
-
def
|
13
|
-
|
15
|
+
def redis_key
|
16
|
+
"wayfarer-counter-#{@task.batch}"
|
14
17
|
end
|
15
18
|
|
16
19
|
def value
|
17
|
-
|
20
|
+
redis_pool.with { |conn| conn.get(redis_key) }.to_i
|
18
21
|
end
|
19
22
|
|
20
23
|
def increment
|
21
|
-
|
24
|
+
redis_pool.with { |conn| conn.incr(redis_key) }
|
22
25
|
end
|
23
26
|
|
24
27
|
def decrement
|
25
|
-
|
28
|
+
redis_pool.with { |conn| conn.decr(redis_key) }.tap do |val|
|
29
|
+
@callback&.call if val == 0
|
30
|
+
end
|
26
31
|
end
|
32
|
+
|
33
|
+
private
|
34
|
+
|
35
|
+
attr_reader :redis_pool
|
27
36
|
end
|
28
37
|
end
|
29
38
|
end
|
data/lib/wayfarer/redis/pool.rb
CHANGED
@@ -0,0 +1,19 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Wayfarer
|
4
|
+
module Redis
|
5
|
+
module Resettable
|
6
|
+
def reset!
|
7
|
+
redis_pool.with { |conn| conn.del(redis_key) }
|
8
|
+
end
|
9
|
+
|
10
|
+
def redis_pool
|
11
|
+
raise NotImplementedError
|
12
|
+
end
|
13
|
+
|
14
|
+
def redis_key
|
15
|
+
raise NotImplementedError
|
16
|
+
end
|
17
|
+
end
|
18
|
+
end
|
19
|
+
end
|
data/lib/wayfarer/routing/dsl.rb
CHANGED
@@ -2,56 +2,192 @@
|
|
2
2
|
|
3
3
|
module Wayfarer
|
4
4
|
module Routing
|
5
|
+
# Routing DSL that declares a tree of {Route}s. Routing trees decide
|
6
|
+
# whether a URL gets processed. Each route has a matcher which is a
|
7
|
+
# predicate. Routes are searched depth-first by a {PathFinder}.
|
8
|
+
#
|
9
|
+
# When you call a DSL method on a route, you declare a child route on the
|
10
|
+
# route. You can pass a keyword list and a block when you call DSL methods.
|
11
|
+
# Keyword lists result in route chains. For example, if you declare:
|
12
|
+
#
|
13
|
+
# ```ruby
|
14
|
+
# route.host "example.com", path: ":foo", query: { page: 1 }
|
15
|
+
# ```
|
16
|
+
#
|
17
|
+
# The last route with the query matcher is returned, and the hierarchy is
|
18
|
+
# `host > path > query`.
|
19
|
+
#
|
20
|
+
# To append multiple child rules, pass a block within which you can call
|
21
|
+
# DSL methods to declare child routes:
|
22
|
+
#
|
23
|
+
# ```ruby
|
24
|
+
# route.host "example.com" do
|
25
|
+
# path ":foo"
|
26
|
+
# query page: 1
|
27
|
+
# end
|
28
|
+
# ```
|
29
|
+
#
|
30
|
+
# The query matcher is returned again, but the hierarchy is
|
31
|
+
# `host > path`, `host > query`.
|
32
|
+
#
|
33
|
+
# @see Route
|
34
|
+
# @see PathFinder
|
5
35
|
module DSL
|
6
|
-
|
7
|
-
|
36
|
+
# Match URLs exactly.
|
37
|
+
#
|
38
|
+
# A trailing slash in +url+ is ignored so
|
39
|
+
# "https://example.com" and "https://example.com/" are equivalent.
|
40
|
+
#
|
41
|
+
# This matcher doesn't collect `params`.
|
42
|
+
#
|
43
|
+
# @param url [String]
|
44
|
+
# @param options [Hash]
|
45
|
+
# @yield [route]
|
46
|
+
# @return [Wayfarer::Routing::Route]
|
47
|
+
#
|
48
|
+
# @example Match URL exactly
|
49
|
+
# route.url("https://www.iana.org/help/example-domains").to(:index)
|
50
|
+
def url(url, **, &)
|
51
|
+
child_route(Matchers::URL.new(url), **, &)
|
8
52
|
end
|
9
53
|
|
10
|
-
|
11
|
-
|
54
|
+
# Match hostnames excluding the port number.
|
55
|
+
#
|
56
|
+
# * `String` is compared literally
|
57
|
+
# * `Regexp` is matched against the host
|
58
|
+
#
|
59
|
+
# This matcher doesn't collect `params`.
|
60
|
+
#
|
61
|
+
# @param host [String, Regexp]
|
62
|
+
# @param options [Hash]
|
63
|
+
# @yield [route]
|
64
|
+
# @return [Wayfarer::Routing::Route]
|
65
|
+
#
|
66
|
+
# @example Literal host
|
67
|
+
# route.host("example.com").to(:home)
|
68
|
+
# @example Regular expression
|
69
|
+
# route.host(/example\.com/) do
|
70
|
+
# path "users/:id", to: :user
|
71
|
+
# end
|
72
|
+
def host(host, **, &)
|
73
|
+
child_route(Matchers::Host.new(host), **, &)
|
12
74
|
end
|
13
75
|
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
76
|
+
# Match and consume path fragments. You can use Sinatra-style pattern
|
77
|
+
# matching to extract data from segments.
|
78
|
+
#
|
79
|
+
# @see https://github.com/sinatra/mustermann/blob/main/mustermann/README.md#-sinatra-pattern
|
80
|
+
#
|
81
|
+
# This matcher doesn't collect `params`.
|
82
|
+
# A leading slash is enforced on the path string.
|
83
|
+
#
|
84
|
+
# @example Capture segment
|
85
|
+
# route.path(":segment").to(:show)
|
86
|
+
# @example Nested paths
|
87
|
+
# route.path("/").path("foo").path(":id").to(:detail)
|
88
|
+
# @param path [String]
|
89
|
+
# @param options [Hash]
|
90
|
+
# @yield [route]
|
91
|
+
# @return [Wayfarer::Routing::Route]
|
92
|
+
def path(path, **, &)
|
93
|
+
path = File.join(File::SEPARATOR, path)
|
20
94
|
|
21
|
-
|
22
|
-
add_child_route(Matchers::Query.new(fields), path_offset, options, &block)
|
95
|
+
child_route(Matchers::Path.new(path), **, &)
|
23
96
|
end
|
24
97
|
|
25
|
-
|
26
|
-
|
98
|
+
# Match query parameters.
|
99
|
+
#
|
100
|
+
# Each key/value pair must be present *at least once*; if multiple values
|
101
|
+
# occur the last one wins (like Rack).
|
102
|
+
#
|
103
|
+
# Supported value types:
|
104
|
+
#
|
105
|
+
# * `String` - exact match
|
106
|
+
# * `Regexp` - regular expression match
|
107
|
+
# * `Integer` - exact numeric match
|
108
|
+
# * `Range` - inclusive numeric range
|
109
|
+
#
|
110
|
+
# This matcher doesn't collect `params`.
|
111
|
+
#
|
112
|
+
# @param fields [Hash{Symbol,String => String,Regexp,Integer,Range}]
|
113
|
+
# @param options [Hash]
|
114
|
+
# @yield [route]
|
115
|
+
# @return [Wayfarer::Routing::Route]
|
116
|
+
#
|
117
|
+
# @example Simple parameter
|
118
|
+
# route.query(foo: "bar").to(:index)
|
119
|
+
# @example Page range
|
120
|
+
# route.query(page: 5..12).to(:index)
|
121
|
+
def query(fields, **, &)
|
122
|
+
child_route(Matchers::Query.new(fields), **, &)
|
27
123
|
end
|
28
124
|
|
29
|
-
|
30
|
-
|
125
|
+
# Match URL schemes (protocols).
|
126
|
+
#
|
127
|
+
# This matcher doesn't collect `params`.
|
128
|
+
#
|
129
|
+
# @param scheme [String, Symbol]
|
130
|
+
# @param options [Hash]
|
131
|
+
# @yield [route]
|
132
|
+
# @return [Wayfarer::Routing::Route]
|
133
|
+
#
|
134
|
+
# @example HTTPS vs HTTP
|
135
|
+
# route.scheme(:https).to(:tls)
|
136
|
+
# route.scheme(:http).to(:plain)
|
137
|
+
def scheme(scheme, **, &)
|
138
|
+
child_route(Matchers::Scheme.new(scheme), **, &)
|
31
139
|
end
|
32
140
|
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
141
|
+
# Declares the action for the current route branch. An action is a symbol
|
142
|
+
# for an instance method, or a {Wayfarer::Handler}.
|
143
|
+
#
|
144
|
+
# In case of conflicting actions for a matching route path, the last
|
145
|
+
# matched action takes precedence.
|
146
|
+
#
|
147
|
+
# @param action [Symbol, Wayfarer::Handler] method or {Handler} to call.
|
148
|
+
# @param options [Hash]
|
149
|
+
# @yield [route]
|
150
|
+
# @return [Wayfarer::Routing::TargetRoute]
|
151
|
+
#
|
152
|
+
# @example Last action wins
|
153
|
+
# route.to(:alpha).to(:beta) # => routes to :beta
|
154
|
+
def to(action, **, &)
|
155
|
+
child_route(nil, action: action, klass: TargetRoute, **, &)
|
37
156
|
end
|
38
157
|
|
39
|
-
|
40
|
-
|
158
|
+
# Match URLs dynamically by declaring a route from a block during route
|
159
|
+
# evaluation. Custom matchers are passed a transient root route which will
|
160
|
+
# be followed. Custom matchers match when their dynamically declared
|
161
|
+
# subtree matches the URL.
|
162
|
+
#
|
163
|
+
# @param options [Hash]
|
164
|
+
# @yield [root, uri, task]
|
165
|
+
# @yieldparam [Wayfarer::Routing::RootRoute] root route to populate
|
166
|
+
# @yieldparam uri [Addressable::URI] parsed task URL
|
167
|
+
# @yieldparam task [Wayfarer::Task] current task
|
168
|
+
# @return [Wayfarer::Routing::Route]
|
169
|
+
#
|
170
|
+
# @example Batch routing
|
171
|
+
# route.custom do |root, _uri, task|
|
172
|
+
# database_record = Crawl.find_by(batch: task.batch)
|
173
|
+
# root.host(database_record.hostname_to_crawl).to(:index)
|
174
|
+
# end
|
175
|
+
def custom(**, &block)
|
176
|
+
child_route(Matchers::Custom.new(block), klass: SubRoute, **)
|
41
177
|
end
|
42
178
|
|
43
179
|
private
|
44
180
|
|
45
|
-
#
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
181
|
+
# @param matcher [Wayfarer::Routing::Matcher, nil]
|
182
|
+
# @param klass [Class<Route>]
|
183
|
+
# @param options [Hash]
|
184
|
+
# @yield [route]
|
185
|
+
# @return [Wayfarer::Routing::Route]
|
186
|
+
def child_route(matcher, klass: Route, **, &)
|
187
|
+
klass.new(matcher: matcher, parent: self, **, &).tap do |route|
|
188
|
+
children.append(route)
|
52
189
|
end
|
53
190
|
end
|
54
|
-
# rubocop:enable Style/OptionalArguments
|
55
191
|
end
|
56
192
|
end
|
57
193
|
end
|
@@ -0,0 +1,33 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Wayfarer
|
4
|
+
module Routing
|
5
|
+
class HashStack
|
6
|
+
EmptyStackError = Class.new(StandardError)
|
7
|
+
|
8
|
+
def self.empty
|
9
|
+
new(Route::EMPTY_PARAMS)
|
10
|
+
end
|
11
|
+
|
12
|
+
def initialize(initial_state)
|
13
|
+
@stack = [initial_state]
|
14
|
+
end
|
15
|
+
|
16
|
+
def push(hash)
|
17
|
+
stack.push(stack.last.dup.merge!(hash))
|
18
|
+
end
|
19
|
+
|
20
|
+
def pop
|
21
|
+
stack.pop || raise(EmptyStackError)
|
22
|
+
end
|
23
|
+
|
24
|
+
def to_h
|
25
|
+
stack.last
|
26
|
+
end
|
27
|
+
|
28
|
+
private
|
29
|
+
|
30
|
+
attr_reader :stack
|
31
|
+
end
|
32
|
+
end
|
33
|
+
end
|
@@ -5,19 +5,22 @@ module Wayfarer
|
|
5
5
|
module Matchers
|
6
6
|
class Custom
|
7
7
|
include Stringify
|
8
|
+
include EmptyParams
|
8
9
|
|
9
10
|
attr_reader :delegate
|
10
11
|
|
11
|
-
def initialize(delegate
|
12
|
+
def initialize(delegate)
|
12
13
|
@delegate = delegate
|
13
14
|
end
|
14
15
|
|
15
|
-
def
|
16
|
-
|
16
|
+
def evaluate(path_finder)
|
17
|
+
Wayfarer::Routing::RootRoute.new.tap do |route|
|
18
|
+
delegate.call(route, path_finder.uri, path_finder.task)
|
19
|
+
end
|
17
20
|
end
|
18
21
|
|
19
|
-
def
|
20
|
-
{}
|
22
|
+
def to_h
|
23
|
+
{ custom: delegate.class.name }
|
21
24
|
end
|
22
25
|
end
|
23
26
|
end
|