wayfarer 0.4.5 → 0.4.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.github/workflows/lint.yaml +25 -0
- data/.github/workflows/release.yaml +29 -0
- data/.github/workflows/tests.yaml +30 -0
- data/.gitignore +4 -0
- data/.rubocop.yml +5 -0
- data/.vale.ini +5 -0
- data/.yardopts +1 -3
- data/Dockerfile +5 -4
- data/Gemfile +3 -0
- data/Gemfile.lock +107 -102
- data/Rakefile +5 -56
- data/bin/wayfarer +1 -1
- data/docker-compose.yml +20 -9
- data/docs/cookbook/consent_screen.md +2 -2
- data/docs/cookbook/executing_javascript.md +3 -3
- data/docs/cookbook/navigation.md +12 -12
- data/docs/cookbook/querying_html.md +3 -3
- data/docs/cookbook/screenshots.md +2 -2
- data/docs/cookbook/user_agent.md +1 -1
- data/docs/design.md +36 -0
- data/docs/guides/callbacks.md +24 -126
- data/docs/guides/configuration.md +8 -8
- data/docs/guides/handlers.md +60 -0
- data/docs/guides/index.md +1 -0
- data/docs/guides/jobs/error_handling.md +40 -0
- data/docs/guides/jobs.md +99 -31
- data/docs/guides/navigation.md +1 -1
- data/docs/guides/networking/capybara.md +13 -22
- data/docs/guides/networking/custom_adapters.md +82 -41
- data/docs/guides/networking/ferrum.md +4 -4
- data/docs/guides/networking/http.md +9 -13
- data/docs/guides/networking/selenium.md +10 -11
- data/docs/guides/pages.md +76 -10
- data/docs/guides/redis.md +10 -0
- data/docs/guides/routing.md +74 -0
- data/docs/guides/tasks.md +33 -9
- data/docs/guides/tutorial.md +60 -0
- data/docs/guides/user_agents.md +113 -0
- data/docs/index.md +17 -40
- data/docs/reference/cli.md +35 -25
- data/docs/reference/configuration.md +36 -0
- data/lib/wayfarer/base.rb +124 -46
- data/lib/wayfarer/batch_completion.rb +56 -0
- data/lib/wayfarer/callbacks.rb +22 -48
- data/lib/wayfarer/cli/route_printer.rb +71 -57
- data/lib/wayfarer/cli.rb +121 -0
- data/lib/wayfarer/gc.rb +13 -6
- data/lib/wayfarer/handler.rb +15 -7
- data/lib/wayfarer/logging.rb +38 -0
- data/lib/wayfarer/middleware/base.rb +2 -0
- data/lib/wayfarer/middleware/batch_completion.rb +19 -0
- data/lib/wayfarer/middleware/content_type.rb +54 -0
- data/lib/wayfarer/middleware/controller.rb +19 -15
- data/lib/wayfarer/middleware/dedup.rb +16 -13
- data/lib/wayfarer/middleware/dispatch.rb +12 -4
- data/lib/wayfarer/middleware/normalize.rb +12 -11
- data/lib/wayfarer/middleware/redis.rb +15 -0
- data/lib/wayfarer/middleware/router.rb +33 -35
- data/lib/wayfarer/middleware/stage.rb +5 -5
- data/lib/wayfarer/middleware/uri_parser.rb +30 -0
- data/lib/wayfarer/middleware/user_agent.rb +49 -0
- data/lib/wayfarer/networking/capybara.rb +1 -1
- data/lib/wayfarer/networking/context.rb +2 -2
- data/lib/wayfarer/networking/ferrum.rb +2 -2
- data/lib/wayfarer/networking/follow.rb +12 -6
- data/lib/wayfarer/networking/http.rb +1 -1
- data/lib/wayfarer/networking/pool.rb +17 -12
- data/lib/wayfarer/networking/selenium.rb +3 -3
- data/lib/wayfarer/networking/strategy.rb +2 -2
- data/lib/wayfarer/page.rb +36 -14
- data/lib/wayfarer/parsing/xml.rb +6 -6
- data/lib/wayfarer/parsing.rb +24 -0
- data/lib/wayfarer/redis/barrier.rb +13 -21
- data/lib/wayfarer/redis/counter.rb +19 -9
- data/lib/wayfarer/redis/pool.rb +1 -1
- data/lib/wayfarer/redis/resettable.rb +19 -0
- data/lib/wayfarer/routing/dsl.rb +1 -0
- data/lib/wayfarer/routing/matchers/path.rb +4 -2
- data/lib/wayfarer/routing/root_route.rb +5 -1
- data/lib/wayfarer/routing/route.rb +4 -14
- data/lib/wayfarer/stringify.rb +22 -30
- data/lib/wayfarer/task.rb +12 -18
- data/lib/wayfarer.rb +29 -2
- data/mkdocs.yml +52 -7
- data/rake/docs.rake +26 -0
- data/rake/lint.rake +105 -0
- data/rake/release.rake +29 -0
- data/rake/tests.rake +28 -0
- data/requirements.txt +1 -1
- data/spec/base_spec.rb +140 -160
- data/spec/batch_completion_spec.rb +104 -0
- data/spec/cli/job_spec.rb +19 -23
- data/spec/cli/routing_spec.rb +101 -0
- data/spec/cli/version_spec.rb +1 -1
- data/spec/factories/task.rb +7 -1
- data/spec/fixtures/dummy_job.rb +5 -3
- data/spec/gc_spec.rb +8 -50
- data/spec/handler_spec.rb +1 -1
- data/spec/integration/callbacks_spec.rb +157 -45
- data/spec/integration/content_type_spec.rb +145 -0
- data/spec/integration/gc_spec.rb +44 -0
- data/spec/integration/handler_spec.rb +66 -0
- data/spec/integration/page_spec.rb +44 -29
- data/spec/integration/params_spec.rb +33 -25
- data/spec/integration/parsing_spec.rb +125 -0
- data/spec/integration/routing_spec.rb +18 -0
- data/spec/integration/stage_spec.rb +27 -20
- data/spec/middleware/batch_completion_spec.rb +34 -0
- data/spec/middleware/chain_spec.rb +8 -8
- data/spec/middleware/content_type_spec.rb +86 -0
- data/spec/middleware/controller_spec.rb +5 -5
- data/spec/middleware/dedup_spec.rb +38 -55
- data/spec/middleware/dispatch_spec.rb +23 -7
- data/spec/middleware/normalize_spec.rb +44 -13
- data/spec/middleware/router_spec.rb +29 -30
- data/spec/middleware/stage_spec.rb +8 -8
- data/spec/middleware/uri_parser_spec.rb +53 -0
- data/spec/middleware/{fetch_spec.rb → user_agent_spec.rb} +28 -27
- data/spec/networking/context_spec.rb +17 -0
- data/spec/networking/follow_spec.rb +2 -2
- data/spec/networking/pool_spec.rb +5 -5
- data/spec/networking/strategy.rb +2 -2
- data/spec/page_spec.rb +42 -20
- data/spec/parsing/xml_spec.rb +11 -12
- data/spec/redis/barrier_spec.rb +8 -48
- data/spec/redis/counter_spec.rb +13 -1
- data/spec/redis/pool_spec.rb +1 -1
- data/spec/spec_helpers.rb +27 -16
- data/spec/support/test_app.rb +8 -0
- data/spec/task_spec.rb +3 -24
- data/spec/wayfarer_spec.rb +1 -1
- data/wayfarer.gemspec +4 -3
- metadata +61 -51
- data/.github/workflows/ci.yaml +0 -32
- data/docs/guides/error_handling.md +0 -31
- data/docs/guides/networking.md +0 -94
- data/docs/guides/performance.md +0 -130
- data/docs/guides/reliability.md +0 -41
- data/docs/guides/routing/steering.md +0 -30
- data/docs/reference/api/base.md +0 -48
- data/docs/reference/configuration_keys.md +0 -42
- data/docs/reference/environment_variables.md +0 -83
- data/lib/wayfarer/cli/base.rb +0 -45
- data/lib/wayfarer/cli/generate.rb +0 -17
- data/lib/wayfarer/cli/job.rb +0 -56
- data/lib/wayfarer/cli/route.rb +0 -29
- data/lib/wayfarer/cli/runner.rb +0 -34
- data/lib/wayfarer/cli/templates/Gemfile.tt +0 -5
- data/lib/wayfarer/cli/templates/job.rb.tt +0 -10
- data/lib/wayfarer/config/capybara.rb +0 -10
- data/lib/wayfarer/config/ferrum.rb +0 -11
- data/lib/wayfarer/config/networking.rb +0 -26
- data/lib/wayfarer/config/redis.rb +0 -14
- data/lib/wayfarer/config/root.rb +0 -11
- data/lib/wayfarer/config/selenium.rb +0 -21
- data/lib/wayfarer/config/strconv.rb +0 -45
- data/lib/wayfarer/config/struct.rb +0 -72
- data/lib/wayfarer/middleware/fetch.rb +0 -56
- data/lib/wayfarer/redis/connection.rb +0 -13
- data/lib/wayfarer/redis/version.rb +0 -19
- data/lib/wayfarer/routing/router.rb +0 -28
- data/spec/callbacks_spec.rb +0 -102
- data/spec/cli/generate_spec.rb +0 -39
- data/spec/config/capybara_spec.rb +0 -18
- data/spec/config/ferrum_spec.rb +0 -24
- data/spec/config/networking_spec.rb +0 -73
- data/spec/config/redis_spec.rb +0 -32
- data/spec/config/root_spec.rb +0 -31
- data/spec/config/selenium_spec.rb +0 -56
- data/spec/config/strconv_spec.rb +0 -58
- data/spec/config/struct_spec.rb +0 -66
- data/spec/integration/steering_spec.rb +0 -57
- data/spec/redis/version_spec.rb +0 -13
- data/spec/routing/router_spec.rb +0 -24
@@ -6,12 +6,20 @@ module Wayfarer
|
|
6
6
|
extend Base
|
7
7
|
|
8
8
|
def call(task)
|
9
|
-
controller = task
|
9
|
+
controller = task[:controller]
|
10
10
|
|
11
|
-
controller.run_callbacks(:action) do
|
12
|
-
case action = task
|
11
|
+
task[:return_value] = controller.run_callbacks(:action) do
|
12
|
+
case action = task[:action]
|
13
13
|
when Symbol then controller.public_send(action)
|
14
|
-
|
14
|
+
when Array
|
15
|
+
handler, method = action
|
16
|
+
task[:action] = method
|
17
|
+
handler.new.call(task)
|
18
|
+
else
|
19
|
+
raise ArgumentError, "invalid action: #{action.inspect}" unless action&.include?(Wayfarer::Handler)
|
20
|
+
|
21
|
+
task[:action] = nil # TODO: Test
|
22
|
+
action.new.call(task)
|
15
23
|
end
|
16
24
|
end
|
17
25
|
|
@@ -4,23 +4,24 @@ module Wayfarer
|
|
4
4
|
module Middleware
|
5
5
|
class Normalize
|
6
6
|
extend Base
|
7
|
+
include Wayfarer::Logging.emit(
|
8
|
+
invalid: [:info, "Failed to normalize HTTP(S) URL"]
|
9
|
+
)
|
7
10
|
|
8
|
-
def
|
9
|
-
|
11
|
+
def self.normalize(uri)
|
12
|
+
return uri.to_s unless %w[http https].include?(uri.scheme)
|
10
13
|
|
11
|
-
|
14
|
+
NormalizeUrl.process(uri)
|
15
|
+
rescue NormalizeUrl::InvalidURIError
|
16
|
+
nil
|
12
17
|
end
|
13
18
|
|
14
|
-
|
19
|
+
def call(task)
|
20
|
+
return (yield if block_given?) if task[:normalized_url]
|
15
21
|
|
16
|
-
|
17
|
-
task.metadata.staged_urls.map(&method(:normalize))
|
18
|
-
end
|
22
|
+
return log(:invalid, task) unless (task[:normalized_url] = self.class.normalize(task[:uri]))
|
19
23
|
|
20
|
-
|
21
|
-
NormalizeUrl.process(url)
|
22
|
-
rescue NormalizeUrl::InvalidURIError
|
23
|
-
nil
|
24
|
+
yield if block_given?
|
24
25
|
end
|
25
26
|
end
|
26
27
|
end
|
@@ -5,51 +5,49 @@ module Wayfarer
|
|
5
5
|
class Router
|
6
6
|
extend Base
|
7
7
|
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
module ClassMethods
|
15
|
-
def router
|
16
|
-
# TODO: Use cattr_accessor
|
17
|
-
@router ||= Wayfarer::Routing::Router.new
|
18
|
-
end
|
8
|
+
include Wayfarer::Logging.emit(
|
9
|
+
mismatch: [:info, "No matching route"],
|
10
|
+
match: [:info, "Routing to %<action>s"],
|
11
|
+
already_routed: [:debug, "Already routed to %<action>s"]
|
12
|
+
)
|
19
13
|
|
20
|
-
|
21
|
-
|
22
|
-
end
|
14
|
+
module API
|
15
|
+
extend ActiveSupport::Concern
|
23
16
|
|
24
|
-
|
25
|
-
|
26
|
-
|
17
|
+
included do
|
18
|
+
class_attribute :route,
|
19
|
+
default: Wayfarer::Routing::RootRoute.new,
|
20
|
+
instance_accessor: false,
|
21
|
+
instance_predicate: false
|
27
22
|
end
|
28
23
|
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
end
|
24
|
+
def action
|
25
|
+
task[:action]
|
26
|
+
end
|
33
27
|
|
34
|
-
|
35
|
-
|
36
|
-
end
|
28
|
+
def params
|
29
|
+
task[:params]
|
37
30
|
end
|
38
31
|
end
|
39
32
|
|
40
33
|
def call(task)
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
34
|
+
# Avoid rerouting when dispatching a [Controller, :action] pair
|
35
|
+
if (action = task[:action])
|
36
|
+
log(:already_routed, task, action: action)
|
37
|
+
|
38
|
+
return (yield if block_given?)
|
39
|
+
end
|
40
|
+
|
41
|
+
case result = task[:controller].class.route.invoke(task[:uri])
|
42
|
+
when Routing::Result::Mismatch then return log(:mismatch, task)
|
49
43
|
when Routing::Result::Match
|
50
|
-
|
51
|
-
|
52
|
-
task.
|
44
|
+
action = result.action
|
45
|
+
|
46
|
+
log(:match, task, action: action.inspect)
|
47
|
+
|
48
|
+
task[:action] = action
|
49
|
+
task[:params] ||= ActiveSupport::HashWithIndifferentAccess.new
|
50
|
+
task[:params].merge!(result.params)
|
53
51
|
end
|
54
52
|
|
55
53
|
yield if block_given?
|
@@ -7,20 +7,20 @@ module Wayfarer
|
|
7
7
|
|
8
8
|
module API
|
9
9
|
def stage(urls)
|
10
|
-
Array.wrap(urls).each { |url| task
|
10
|
+
Array.wrap(urls).each { |url| task[:staged_urls].add(url.to_s) }
|
11
11
|
end
|
12
12
|
end
|
13
13
|
|
14
14
|
def call(task)
|
15
|
-
task
|
15
|
+
task[:staged_urls] = Set.new
|
16
16
|
|
17
17
|
yield if block_given?
|
18
18
|
|
19
|
-
task
|
20
|
-
task
|
19
|
+
task[:staged_urls].each do |url|
|
20
|
+
task[:job].class.crawl(url, batch: task.batch)
|
21
21
|
end
|
22
22
|
|
23
|
-
task
|
23
|
+
task[:staged_urls].clear
|
24
24
|
end
|
25
25
|
end
|
26
26
|
end
|
@@ -0,0 +1,30 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Wayfarer
|
4
|
+
module Middleware
|
5
|
+
class UriParser
|
6
|
+
extend Base
|
7
|
+
|
8
|
+
include Wayfarer::Logging.emit(
|
9
|
+
invalid: [:info, "Not processing invalid URL (%<message>s)"]
|
10
|
+
)
|
11
|
+
|
12
|
+
module API
|
13
|
+
def uri
|
14
|
+
task[:uri]
|
15
|
+
end
|
16
|
+
end
|
17
|
+
|
18
|
+
def call(task)
|
19
|
+
# TODO: Test
|
20
|
+
task[:uri] ||= begin
|
21
|
+
Addressable::URI.parse(task.url).normalize
|
22
|
+
rescue Addressable::URI::InvalidURIError => e
|
23
|
+
return log(:invalid, task, message: e.message)
|
24
|
+
end
|
25
|
+
|
26
|
+
yield if block_given?
|
27
|
+
end
|
28
|
+
end
|
29
|
+
end
|
30
|
+
end
|
@@ -0,0 +1,49 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Wayfarer
|
4
|
+
module Middleware
|
5
|
+
class UserAgent
|
6
|
+
extend Base
|
7
|
+
|
8
|
+
module API
|
9
|
+
def user_agent
|
10
|
+
task[:context]&.instance
|
11
|
+
end
|
12
|
+
|
13
|
+
def page(live: false)
|
14
|
+
return task[:page] unless live
|
15
|
+
|
16
|
+
task[:page] = task[:context].live&.page || task[:page]
|
17
|
+
end
|
18
|
+
|
19
|
+
def fetch(url, follow: 3)
|
20
|
+
(@http ||= Wayfarer::Networking::Follow.http).fetch(url, follow: follow)
|
21
|
+
end
|
22
|
+
end
|
23
|
+
|
24
|
+
def call(task)
|
25
|
+
pool.with do |context|
|
26
|
+
task[:context] = context
|
27
|
+
|
28
|
+
result = task[:controller].run_callbacks(:fetch) do
|
29
|
+
context.fetch(task.url)
|
30
|
+
end
|
31
|
+
|
32
|
+
case result
|
33
|
+
when Networking::Result::Redirect
|
34
|
+
task[:controller].stage(result.redirect_url)
|
35
|
+
when Networking::Result::Success
|
36
|
+
task[:page] = result.page
|
37
|
+
yield if block_given?
|
38
|
+
end
|
39
|
+
end
|
40
|
+
end
|
41
|
+
|
42
|
+
private
|
43
|
+
|
44
|
+
def pool
|
45
|
+
Wayfarer::Networking::Pool.instance
|
46
|
+
end
|
47
|
+
end
|
48
|
+
end
|
49
|
+
end
|
@@ -25,8 +25,8 @@ module Wayfarer
|
|
25
25
|
|
26
26
|
def supervise
|
27
27
|
yield
|
28
|
-
rescue *strategy.renew_on => e
|
29
|
-
renew
|
28
|
+
rescue *strategy.renew_on, *Wayfarer.config[:network][:renew_on] => e
|
29
|
+
renew # may raise
|
30
30
|
ensure
|
31
31
|
# If renewing raises, re-raise the originally caught exception
|
32
32
|
# TODO: Not nice this effectively swallows exceptions
|
@@ -10,8 +10,8 @@ module Wayfarer
|
|
10
10
|
end
|
11
11
|
|
12
12
|
def create
|
13
|
-
::Ferrum::Browser.new(Wayfarer.config
|
14
|
-
browser.headers.set(Wayfarer.config
|
13
|
+
::Ferrum::Browser.new(Wayfarer.config[:ferrum][:options]).tap do |browser|
|
14
|
+
browser.headers.set(Wayfarer.config[:network][:http_headers])
|
15
15
|
end
|
16
16
|
end
|
17
17
|
|
@@ -2,15 +2,21 @@
|
|
2
2
|
|
3
3
|
module Wayfarer
|
4
4
|
module Networking
|
5
|
-
|
5
|
+
class Follow
|
6
|
+
RedirectsExhaustedError = Class.new(StandardError)
|
6
7
|
|
7
|
-
|
8
|
-
|
8
|
+
def self.http
|
9
|
+
new(Wayfarer::Networking::Context.new(Wayfarer::Networking::HTTP.new))
|
10
|
+
end
|
11
|
+
|
12
|
+
attr_reader :context
|
9
13
|
|
10
|
-
|
14
|
+
def initialize(context)
|
15
|
+
@context = context
|
16
|
+
end
|
11
17
|
|
12
|
-
def fetch(url, follow:
|
13
|
-
raise RedirectsExhaustedError if follow
|
18
|
+
def fetch(url, follow:)
|
19
|
+
raise RedirectsExhaustedError if follow < 0
|
14
20
|
|
15
21
|
case result = context.fetch(url)
|
16
22
|
when Result::Success then result.page
|
@@ -9,7 +9,7 @@ module Wayfarer
|
|
9
9
|
|
10
10
|
def create
|
11
11
|
Net::HTTP::Persistent.new(name: CONNECTION_NAME).tap do |conn|
|
12
|
-
Wayfarer.config
|
12
|
+
Wayfarer.config[:network][:http_headers].each do |key, val|
|
13
13
|
conn.override_headers[key] = val
|
14
14
|
end
|
15
15
|
end
|
@@ -5,25 +5,30 @@ module Wayfarer
|
|
5
5
|
class Pool
|
6
6
|
include Singleton
|
7
7
|
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
8
|
+
class_attribute :registry,
|
9
|
+
default: { http: HTTP,
|
10
|
+
ferrum: Ferrum,
|
11
|
+
selenium: Selenium,
|
12
|
+
capybara: Capybara },
|
13
|
+
instance_accessor: false,
|
14
|
+
instance_predicate: false
|
15
|
+
|
16
|
+
def initialize
|
17
|
+
@pool = ConnectionPool.new(size: Wayfarer.config[:network][:pool_size],
|
18
|
+
timeout: Wayfarer.config[:network][:pool_timeout],
|
19
|
+
&method(:context))
|
20
|
+
|
21
|
+
at_exit { free }
|
17
22
|
end
|
18
23
|
|
19
24
|
def with(&block)
|
20
|
-
pool.with(&block)
|
25
|
+
@pool.with(&block)
|
21
26
|
rescue ConnectionPool::TimeoutError => e
|
22
27
|
raise Wayfarer::UserAgentTimeoutError, e
|
23
28
|
end
|
24
29
|
|
25
30
|
def free
|
26
|
-
pool.shutdown(&:renew)
|
31
|
+
@pool.shutdown(&:renew)
|
27
32
|
end
|
28
33
|
|
29
34
|
private
|
@@ -33,7 +38,7 @@ module Wayfarer
|
|
33
38
|
end
|
34
39
|
|
35
40
|
def strategy
|
36
|
-
self.class.registry[Wayfarer.config
|
41
|
+
self.class.registry[Wayfarer.config[:network][:agent]].new
|
37
42
|
end
|
38
43
|
end
|
39
44
|
end
|
@@ -9,7 +9,7 @@ module Wayfarer
|
|
9
9
|
MOCK_RESPONSE_HEADERS = {}.freeze
|
10
10
|
|
11
11
|
def create
|
12
|
-
::Selenium::WebDriver.for(Wayfarer.config
|
12
|
+
::Selenium::WebDriver.for(Wayfarer.config[:selenium][:driver], **options)
|
13
13
|
end
|
14
14
|
|
15
15
|
def destroy(instance)
|
@@ -30,12 +30,12 @@ module Wayfarer
|
|
30
30
|
private
|
31
31
|
|
32
32
|
def options
|
33
|
-
Wayfarer.config
|
33
|
+
Wayfarer.config[:selenium][:options].merge(http_client: http_client)
|
34
34
|
end
|
35
35
|
|
36
36
|
def http_client
|
37
37
|
::Selenium::WebDriver::Remote::Http::Default.new.tap do |client|
|
38
|
-
client.read_timeout = Wayfarer.config
|
38
|
+
client.read_timeout = Wayfarer.config[:selenium][:client_timeout]
|
39
39
|
end
|
40
40
|
end
|
41
41
|
end
|
@@ -13,13 +13,13 @@ module Wayfarer
|
|
13
13
|
end
|
14
14
|
|
15
15
|
def navigate(_instance, _url)
|
16
|
-
raise
|
16
|
+
raise NotImplementedError
|
17
17
|
end
|
18
18
|
|
19
19
|
def live(_instance); end
|
20
20
|
|
21
21
|
def create
|
22
|
-
raise
|
22
|
+
raise NotImplementedError
|
23
23
|
end
|
24
24
|
|
25
25
|
def destroy(_instance); end
|
data/lib/wayfarer/page.rb
CHANGED
@@ -1,12 +1,22 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
3
|
module Wayfarer
|
4
|
+
# @!attribute [r] url
|
5
|
+
# @return [String] the URL that was fetched
|
6
|
+
# @!attribute [r] status_code
|
7
|
+
# @return [Fixnum] HTTP status code
|
8
|
+
# @!attribute [r] body
|
9
|
+
# @return [String] the body of the response
|
10
|
+
# @!attribute [r] headers
|
11
|
+
# @return [Hash] the headers of the response
|
12
|
+
# @note HTTP header keys are downcased, for example: `content-type`.
|
4
13
|
class Page
|
5
14
|
attr_reader :url,
|
6
15
|
:status_code,
|
7
16
|
:body,
|
8
17
|
:headers
|
9
18
|
|
19
|
+
# @!visibility private
|
10
20
|
def initialize(url:, status_code:, body:, headers:)
|
11
21
|
@url = url
|
12
22
|
@status_code = status_code
|
@@ -14,24 +24,36 @@ module Wayfarer
|
|
14
24
|
@headers = headers.transform_keys(&:downcase)
|
15
25
|
end
|
16
26
|
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
content_type = headers["content-type"]
|
24
|
-
sub_type = MIME::Types[content_type].first.sub_type
|
27
|
+
# Returns the MIME type of the response.
|
28
|
+
# @return [MIME::Type]
|
29
|
+
# @see https://www.rubydoc.info/gems/mime-types/MIME/Type
|
30
|
+
def mime_type
|
31
|
+
@mime_type ||= MIME::Types[content_type]&.first
|
32
|
+
end
|
25
33
|
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
34
|
+
# Returns a parsed representation of the HTTP response or the browser DOM,
|
35
|
+
# depending on the Content-Type.
|
36
|
+
# @return [Nokogiri::HTML::Document] when Content-Type is `text/html`
|
37
|
+
# @see https://www.rubydoc.info/github/sparklemotion/nokogiri/Nokogiri/HTML/Document Nokogiri::HTML::Document
|
38
|
+
# @return [Nokogiri::XML::Document] when Content-Type is `text/xml`
|
39
|
+
# @see https://www.rubydoc.info/github/sparklemotion/nokogiri/Nokogiri/XML/Document Nokogiri::XML::Document
|
40
|
+
# @return [Hash] when Content-Type is `application/json`
|
41
|
+
# @note You can register custom parsers with {Wayfarer::Parsing.registry}.
|
42
|
+
def doc
|
43
|
+
@doc ||= Wayfarer::Parsing.parse(body, mime_type&.content_type || content_type)
|
31
44
|
end
|
32
45
|
|
46
|
+
# Returns a `MetaInspector::Document`.
|
47
|
+
# @return [MetaInspector::Document]
|
48
|
+
# @see https://www.rubydoc.info/gems/metainspector/MetaInspector/Document
|
33
49
|
def meta
|
34
|
-
@meta ||= MetaInspector.new(url, document: body)
|
50
|
+
@meta ||= MetaInspector.new(url, document: body, headers: headers, normalize_url: false)
|
51
|
+
end
|
52
|
+
|
53
|
+
private
|
54
|
+
|
55
|
+
def content_type
|
56
|
+
@content_type ||= headers["content-type"]
|
35
57
|
end
|
36
58
|
end
|
37
59
|
end
|
data/lib/wayfarer/parsing/xml.rb
CHANGED
@@ -5,12 +5,12 @@ module Wayfarer
|
|
5
5
|
module XML
|
6
6
|
module_function
|
7
7
|
|
8
|
-
def
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
8
|
+
def parse(xml, variant)
|
9
|
+
case variant
|
10
|
+
when :xml then Nokogiri::XML(xml)
|
11
|
+
when :html then Nokogiri::HTML(xml)
|
12
|
+
else raise ArgumentError, "Unknown type: #{type}"
|
13
|
+
end
|
14
14
|
end
|
15
15
|
end
|
16
16
|
end
|
@@ -0,0 +1,24 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Wayfarer
|
4
|
+
# @!scope class
|
5
|
+
# @!attribute [r] registry
|
6
|
+
# @return [Hash] Mapping of Content-Type to parser.
|
7
|
+
module Parsing
|
8
|
+
# @!visibility private
|
9
|
+
FALLBACK_CONTENT_TYPE = "application/octet-stream"
|
10
|
+
|
11
|
+
mattr_accessor :registry, default: { "application/json" => JSON,
|
12
|
+
"text/html" => [XML, :html],
|
13
|
+
"application/xml" => [XML, :xml] }
|
14
|
+
|
15
|
+
module_function
|
16
|
+
|
17
|
+
# @!visibility private
|
18
|
+
def parse(body, content_type = FALLBACK_CONTENT_TYPE)
|
19
|
+
parser, args = registry[content_type] || return
|
20
|
+
|
21
|
+
parser.parse(body, *args)
|
22
|
+
end
|
23
|
+
end
|
24
|
+
end
|
@@ -2,35 +2,27 @@
|
|
2
2
|
|
3
3
|
module Wayfarer
|
4
4
|
module Redis
|
5
|
-
Barrier
|
6
|
-
include
|
5
|
+
class Barrier
|
6
|
+
include Resettable
|
7
7
|
|
8
|
-
|
9
|
-
"wayfarer-barrier-#{batch}"
|
10
|
-
end
|
8
|
+
attr_reader :task
|
11
9
|
|
12
|
-
def
|
13
|
-
|
10
|
+
def initialize(task)
|
11
|
+
@task = task
|
12
|
+
@redis_pool = task[:redis_pool]
|
14
13
|
end
|
15
14
|
|
16
|
-
def
|
17
|
-
|
15
|
+
def redis_key
|
16
|
+
"wayfarer-barrier-#{task.batch}"
|
18
17
|
end
|
19
18
|
|
20
|
-
def
|
21
|
-
|
22
|
-
|
23
|
-
# SMISMEMBER is only supported on Redis >= 6.2.0
|
24
|
-
if major > 6 || (major == 6 && minor >= 2)
|
25
|
-
redis { |conn| conn.smismember(redis_key, urls) }.map { |val| val == 1 }
|
26
|
-
else
|
27
|
-
urls.map { |url| redis { |conn| conn.sismember(redis_key, url) } }
|
28
|
-
end
|
19
|
+
def check!(url)
|
20
|
+
!redis_pool.with { |conn| conn.hsetnx(redis_key, url, "") }
|
29
21
|
end
|
30
22
|
|
31
|
-
|
32
|
-
|
33
|
-
|
23
|
+
private
|
24
|
+
|
25
|
+
attr_reader :redis_pool
|
34
26
|
end
|
35
27
|
end
|
36
28
|
end
|
@@ -2,28 +2,38 @@
|
|
2
2
|
|
3
3
|
module Wayfarer
|
4
4
|
module Redis
|
5
|
-
Counter
|
6
|
-
include
|
5
|
+
class Counter
|
6
|
+
include Resettable
|
7
7
|
|
8
|
-
|
9
|
-
|
8
|
+
attr_reader :task
|
9
|
+
|
10
|
+
def initialize(task, &callback)
|
11
|
+
@task = task
|
12
|
+
@callback = callback
|
13
|
+
@redis_pool = task[:redis_pool]
|
10
14
|
end
|
11
15
|
|
12
|
-
def
|
13
|
-
|
16
|
+
def redis_key
|
17
|
+
"wayfarer-counter-#{@task.batch}"
|
14
18
|
end
|
15
19
|
|
16
20
|
def value
|
17
|
-
|
21
|
+
redis_pool.with { |conn| conn.get(redis_key) }.to_i
|
18
22
|
end
|
19
23
|
|
20
24
|
def increment
|
21
|
-
|
25
|
+
redis_pool.with { |conn| conn.incr(redis_key) }
|
22
26
|
end
|
23
27
|
|
24
28
|
def decrement
|
25
|
-
|
29
|
+
redis_pool.with { |conn| conn.decr(redis_key) }.tap do |val|
|
30
|
+
@callback&.call if val == 0
|
31
|
+
end
|
26
32
|
end
|
33
|
+
|
34
|
+
private
|
35
|
+
|
36
|
+
attr_reader :redis_pool
|
27
37
|
end
|
28
38
|
end
|
29
39
|
end
|