wayfarer 0.4.6 → 0.4.7
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.github/workflows/lint.yaml +25 -0
- data/.github/workflows/release.yaml +29 -0
- data/.github/workflows/tests.yaml +30 -0
- data/.gitignore +4 -0
- data/.rubocop.yml +5 -0
- data/.vale.ini +5 -0
- data/.yardopts +1 -3
- data/Dockerfile +5 -4
- data/Gemfile +3 -0
- data/Gemfile.lock +107 -102
- data/Rakefile +5 -56
- data/bin/wayfarer +1 -1
- data/docker-compose.yml +20 -9
- data/docs/cookbook/consent_screen.md +2 -2
- data/docs/cookbook/executing_javascript.md +3 -3
- data/docs/cookbook/navigation.md +12 -12
- data/docs/cookbook/querying_html.md +3 -3
- data/docs/cookbook/screenshots.md +2 -2
- data/docs/cookbook/user_agent.md +1 -1
- data/docs/design.md +36 -0
- data/docs/guides/callbacks.md +24 -126
- data/docs/guides/configuration.md +8 -8
- data/docs/guides/handlers.md +60 -0
- data/docs/guides/index.md +1 -0
- data/docs/guides/jobs/error_handling.md +40 -0
- data/docs/guides/jobs.md +99 -31
- data/docs/guides/navigation.md +1 -1
- data/docs/guides/networking/capybara.md +13 -22
- data/docs/guides/networking/custom_adapters.md +82 -41
- data/docs/guides/networking/ferrum.md +4 -4
- data/docs/guides/networking/http.md +9 -13
- data/docs/guides/networking/selenium.md +10 -11
- data/docs/guides/pages.md +76 -10
- data/docs/guides/redis.md +10 -0
- data/docs/guides/routing.md +74 -0
- data/docs/guides/tasks.md +33 -9
- data/docs/guides/tutorial.md +60 -0
- data/docs/guides/user_agents.md +113 -0
- data/docs/index.md +17 -40
- data/docs/reference/cli.md +35 -25
- data/docs/reference/configuration.md +36 -0
- data/lib/wayfarer/base.rb +124 -46
- data/lib/wayfarer/batch_completion.rb +56 -0
- data/lib/wayfarer/callbacks.rb +22 -48
- data/lib/wayfarer/cli/route_printer.rb +71 -57
- data/lib/wayfarer/cli.rb +121 -0
- data/lib/wayfarer/gc.rb +13 -6
- data/lib/wayfarer/handler.rb +15 -7
- data/lib/wayfarer/logging.rb +38 -0
- data/lib/wayfarer/middleware/base.rb +2 -0
- data/lib/wayfarer/middleware/batch_completion.rb +19 -0
- data/lib/wayfarer/middleware/content_type.rb +54 -0
- data/lib/wayfarer/middleware/controller.rb +19 -15
- data/lib/wayfarer/middleware/dedup.rb +16 -13
- data/lib/wayfarer/middleware/dispatch.rb +12 -4
- data/lib/wayfarer/middleware/normalize.rb +12 -11
- data/lib/wayfarer/middleware/redis.rb +15 -0
- data/lib/wayfarer/middleware/router.rb +33 -35
- data/lib/wayfarer/middleware/stage.rb +5 -5
- data/lib/wayfarer/middleware/uri_parser.rb +30 -0
- data/lib/wayfarer/middleware/user_agent.rb +49 -0
- data/lib/wayfarer/networking/capybara.rb +1 -1
- data/lib/wayfarer/networking/context.rb +2 -2
- data/lib/wayfarer/networking/ferrum.rb +2 -2
- data/lib/wayfarer/networking/follow.rb +12 -6
- data/lib/wayfarer/networking/http.rb +1 -1
- data/lib/wayfarer/networking/pool.rb +17 -12
- data/lib/wayfarer/networking/selenium.rb +3 -3
- data/lib/wayfarer/networking/strategy.rb +2 -2
- data/lib/wayfarer/page.rb +36 -14
- data/lib/wayfarer/parsing/xml.rb +6 -6
- data/lib/wayfarer/parsing.rb +24 -0
- data/lib/wayfarer/redis/barrier.rb +13 -21
- data/lib/wayfarer/redis/counter.rb +19 -9
- data/lib/wayfarer/redis/pool.rb +1 -1
- data/lib/wayfarer/redis/resettable.rb +19 -0
- data/lib/wayfarer/routing/dsl.rb +1 -0
- data/lib/wayfarer/routing/matchers/path.rb +4 -2
- data/lib/wayfarer/routing/root_route.rb +5 -1
- data/lib/wayfarer/routing/route.rb +4 -14
- data/lib/wayfarer/stringify.rb +22 -30
- data/lib/wayfarer/task.rb +12 -18
- data/lib/wayfarer.rb +28 -1
- data/mkdocs.yml +52 -7
- data/rake/docs.rake +26 -0
- data/rake/lint.rake +105 -0
- data/rake/release.rake +29 -0
- data/rake/tests.rake +28 -0
- data/requirements.txt +1 -1
- data/spec/base_spec.rb +140 -160
- data/spec/batch_completion_spec.rb +104 -0
- data/spec/cli/job_spec.rb +19 -23
- data/spec/cli/routing_spec.rb +101 -0
- data/spec/cli/version_spec.rb +1 -1
- data/spec/factories/task.rb +7 -1
- data/spec/fixtures/dummy_job.rb +5 -3
- data/spec/gc_spec.rb +8 -50
- data/spec/handler_spec.rb +1 -1
- data/spec/integration/callbacks_spec.rb +157 -45
- data/spec/integration/content_type_spec.rb +145 -0
- data/spec/integration/gc_spec.rb +44 -0
- data/spec/integration/handler_spec.rb +66 -0
- data/spec/integration/page_spec.rb +44 -29
- data/spec/integration/params_spec.rb +33 -25
- data/spec/integration/parsing_spec.rb +125 -0
- data/spec/integration/routing_spec.rb +18 -0
- data/spec/integration/stage_spec.rb +27 -20
- data/spec/middleware/batch_completion_spec.rb +34 -0
- data/spec/middleware/chain_spec.rb +8 -8
- data/spec/middleware/content_type_spec.rb +86 -0
- data/spec/middleware/controller_spec.rb +5 -5
- data/spec/middleware/dedup_spec.rb +38 -55
- data/spec/middleware/dispatch_spec.rb +23 -7
- data/spec/middleware/normalize_spec.rb +44 -13
- data/spec/middleware/router_spec.rb +29 -30
- data/spec/middleware/stage_spec.rb +8 -8
- data/spec/middleware/uri_parser_spec.rb +53 -0
- data/spec/middleware/{fetch_spec.rb → user_agent_spec.rb} +28 -27
- data/spec/networking/context_spec.rb +1 -1
- data/spec/networking/follow_spec.rb +2 -2
- data/spec/networking/pool_spec.rb +5 -5
- data/spec/networking/strategy.rb +2 -2
- data/spec/page_spec.rb +42 -20
- data/spec/parsing/xml_spec.rb +11 -12
- data/spec/redis/barrier_spec.rb +8 -48
- data/spec/redis/counter_spec.rb +13 -1
- data/spec/redis/pool_spec.rb +1 -1
- data/spec/spec_helpers.rb +27 -16
- data/spec/support/test_app.rb +8 -0
- data/spec/task_spec.rb +3 -24
- data/spec/wayfarer_spec.rb +1 -1
- data/wayfarer.gemspec +4 -3
- metadata +61 -51
- data/.github/workflows/ci.yaml +0 -32
- data/docs/guides/error_handling.md +0 -53
- data/docs/guides/networking.md +0 -94
- data/docs/guides/performance.md +0 -130
- data/docs/guides/reliability.md +0 -41
- data/docs/guides/routing/steering.md +0 -30
- data/docs/reference/api/base.md +0 -48
- data/docs/reference/configuration_keys.md +0 -43
- data/docs/reference/environment_variables.md +0 -83
- data/lib/wayfarer/cli/base.rb +0 -45
- data/lib/wayfarer/cli/generate.rb +0 -17
- data/lib/wayfarer/cli/job.rb +0 -56
- data/lib/wayfarer/cli/route.rb +0 -29
- data/lib/wayfarer/cli/runner.rb +0 -34
- data/lib/wayfarer/cli/templates/Gemfile.tt +0 -5
- data/lib/wayfarer/cli/templates/job.rb.tt +0 -10
- data/lib/wayfarer/config/capybara.rb +0 -10
- data/lib/wayfarer/config/ferrum.rb +0 -11
- data/lib/wayfarer/config/networking.rb +0 -29
- data/lib/wayfarer/config/redis.rb +0 -14
- data/lib/wayfarer/config/root.rb +0 -11
- data/lib/wayfarer/config/selenium.rb +0 -21
- data/lib/wayfarer/config/strconv.rb +0 -45
- data/lib/wayfarer/config/struct.rb +0 -72
- data/lib/wayfarer/middleware/fetch.rb +0 -56
- data/lib/wayfarer/redis/connection.rb +0 -13
- data/lib/wayfarer/redis/version.rb +0 -19
- data/lib/wayfarer/routing/router.rb +0 -28
- data/spec/callbacks_spec.rb +0 -102
- data/spec/cli/generate_spec.rb +0 -39
- data/spec/config/capybara_spec.rb +0 -18
- data/spec/config/ferrum_spec.rb +0 -24
- data/spec/config/networking_spec.rb +0 -73
- data/spec/config/redis_spec.rb +0 -32
- data/spec/config/root_spec.rb +0 -31
- data/spec/config/selenium_spec.rb +0 -56
- data/spec/config/strconv_spec.rb +0 -58
- data/spec/config/struct_spec.rb +0 -66
- data/spec/integration/steering_spec.rb +0 -57
- data/spec/redis/version_spec.rb +0 -13
- data/spec/routing/router_spec.rb +0 -24
@@ -6,12 +6,20 @@ module Wayfarer
|
|
6
6
|
extend Base
|
7
7
|
|
8
8
|
def call(task)
|
9
|
-
controller = task
|
9
|
+
controller = task[:controller]
|
10
10
|
|
11
|
-
controller.run_callbacks(:action) do
|
12
|
-
case action = task
|
11
|
+
task[:return_value] = controller.run_callbacks(:action) do
|
12
|
+
case action = task[:action]
|
13
13
|
when Symbol then controller.public_send(action)
|
14
|
-
|
14
|
+
when Array
|
15
|
+
handler, method = action
|
16
|
+
task[:action] = method
|
17
|
+
handler.new.call(task)
|
18
|
+
else
|
19
|
+
raise ArgumentError, "invalid action: #{action.inspect}" unless action&.include?(Wayfarer::Handler)
|
20
|
+
|
21
|
+
task[:action] = nil # TODO: Test
|
22
|
+
action.new.call(task)
|
15
23
|
end
|
16
24
|
end
|
17
25
|
|
@@ -4,23 +4,24 @@ module Wayfarer
|
|
4
4
|
module Middleware
|
5
5
|
class Normalize
|
6
6
|
extend Base
|
7
|
+
include Wayfarer::Logging.emit(
|
8
|
+
invalid: [:info, "Failed to normalize HTTP(S) URL"]
|
9
|
+
)
|
7
10
|
|
8
|
-
def
|
9
|
-
|
11
|
+
def self.normalize(uri)
|
12
|
+
return uri.to_s unless %w[http https].include?(uri.scheme)
|
10
13
|
|
11
|
-
|
14
|
+
NormalizeUrl.process(uri)
|
15
|
+
rescue NormalizeUrl::InvalidURIError
|
16
|
+
nil
|
12
17
|
end
|
13
18
|
|
14
|
-
|
19
|
+
def call(task)
|
20
|
+
return (yield if block_given?) if task[:normalized_url]
|
15
21
|
|
16
|
-
|
17
|
-
task.metadata.staged_urls.map(&method(:normalize))
|
18
|
-
end
|
22
|
+
return log(:invalid, task) unless (task[:normalized_url] = self.class.normalize(task[:uri]))
|
19
23
|
|
20
|
-
|
21
|
-
NormalizeUrl.process(url)
|
22
|
-
rescue NormalizeUrl::InvalidURIError
|
23
|
-
nil
|
24
|
+
yield if block_given?
|
24
25
|
end
|
25
26
|
end
|
26
27
|
end
|
@@ -5,51 +5,49 @@ module Wayfarer
|
|
5
5
|
class Router
|
6
6
|
extend Base
|
7
7
|
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
module ClassMethods
|
15
|
-
def router
|
16
|
-
# TODO: Use cattr_accessor
|
17
|
-
@router ||= Wayfarer::Routing::Router.new
|
18
|
-
end
|
8
|
+
include Wayfarer::Logging.emit(
|
9
|
+
mismatch: [:info, "No matching route"],
|
10
|
+
match: [:info, "Routing to %<action>s"],
|
11
|
+
already_routed: [:debug, "Already routed to %<action>s"]
|
12
|
+
)
|
19
13
|
|
20
|
-
|
21
|
-
|
22
|
-
end
|
14
|
+
module API
|
15
|
+
extend ActiveSupport::Concern
|
23
16
|
|
24
|
-
|
25
|
-
|
26
|
-
|
17
|
+
included do
|
18
|
+
class_attribute :route,
|
19
|
+
default: Wayfarer::Routing::RootRoute.new,
|
20
|
+
instance_accessor: false,
|
21
|
+
instance_predicate: false
|
27
22
|
end
|
28
23
|
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
end
|
24
|
+
def action
|
25
|
+
task[:action]
|
26
|
+
end
|
33
27
|
|
34
|
-
|
35
|
-
|
36
|
-
end
|
28
|
+
def params
|
29
|
+
task[:params]
|
37
30
|
end
|
38
31
|
end
|
39
32
|
|
40
33
|
def call(task)
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
34
|
+
# Avoid rerouting when dispatching a [Controller, :action] pair
|
35
|
+
if (action = task[:action])
|
36
|
+
log(:already_routed, task, action: action)
|
37
|
+
|
38
|
+
return (yield if block_given?)
|
39
|
+
end
|
40
|
+
|
41
|
+
case result = task[:controller].class.route.invoke(task[:uri])
|
42
|
+
when Routing::Result::Mismatch then return log(:mismatch, task)
|
49
43
|
when Routing::Result::Match
|
50
|
-
|
51
|
-
|
52
|
-
task.
|
44
|
+
action = result.action
|
45
|
+
|
46
|
+
log(:match, task, action: action.inspect)
|
47
|
+
|
48
|
+
task[:action] = action
|
49
|
+
task[:params] ||= ActiveSupport::HashWithIndifferentAccess.new
|
50
|
+
task[:params].merge!(result.params)
|
53
51
|
end
|
54
52
|
|
55
53
|
yield if block_given?
|
@@ -7,20 +7,20 @@ module Wayfarer
|
|
7
7
|
|
8
8
|
module API
|
9
9
|
def stage(urls)
|
10
|
-
Array.wrap(urls).each { |url| task
|
10
|
+
Array.wrap(urls).each { |url| task[:staged_urls].add(url.to_s) }
|
11
11
|
end
|
12
12
|
end
|
13
13
|
|
14
14
|
def call(task)
|
15
|
-
task
|
15
|
+
task[:staged_urls] = Set.new
|
16
16
|
|
17
17
|
yield if block_given?
|
18
18
|
|
19
|
-
task
|
20
|
-
task
|
19
|
+
task[:staged_urls].each do |url|
|
20
|
+
task[:job].class.crawl(url, batch: task.batch)
|
21
21
|
end
|
22
22
|
|
23
|
-
task
|
23
|
+
task[:staged_urls].clear
|
24
24
|
end
|
25
25
|
end
|
26
26
|
end
|
@@ -0,0 +1,30 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Wayfarer
|
4
|
+
module Middleware
|
5
|
+
class UriParser
|
6
|
+
extend Base
|
7
|
+
|
8
|
+
include Wayfarer::Logging.emit(
|
9
|
+
invalid: [:info, "Not processing invalid URL (%<message>s)"]
|
10
|
+
)
|
11
|
+
|
12
|
+
module API
|
13
|
+
def uri
|
14
|
+
task[:uri]
|
15
|
+
end
|
16
|
+
end
|
17
|
+
|
18
|
+
def call(task)
|
19
|
+
# TODO: Test
|
20
|
+
task[:uri] ||= begin
|
21
|
+
Addressable::URI.parse(task.url).normalize
|
22
|
+
rescue Addressable::URI::InvalidURIError => e
|
23
|
+
return log(:invalid, task, message: e.message)
|
24
|
+
end
|
25
|
+
|
26
|
+
yield if block_given?
|
27
|
+
end
|
28
|
+
end
|
29
|
+
end
|
30
|
+
end
|
@@ -0,0 +1,49 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Wayfarer
|
4
|
+
module Middleware
|
5
|
+
class UserAgent
|
6
|
+
extend Base
|
7
|
+
|
8
|
+
module API
|
9
|
+
def user_agent
|
10
|
+
task[:context]&.instance
|
11
|
+
end
|
12
|
+
|
13
|
+
def page(live: false)
|
14
|
+
return task[:page] unless live
|
15
|
+
|
16
|
+
task[:page] = task[:context].live&.page || task[:page]
|
17
|
+
end
|
18
|
+
|
19
|
+
def fetch(url, follow: 3)
|
20
|
+
(@http ||= Wayfarer::Networking::Follow.http).fetch(url, follow: follow)
|
21
|
+
end
|
22
|
+
end
|
23
|
+
|
24
|
+
def call(task)
|
25
|
+
pool.with do |context|
|
26
|
+
task[:context] = context
|
27
|
+
|
28
|
+
result = task[:controller].run_callbacks(:fetch) do
|
29
|
+
context.fetch(task.url)
|
30
|
+
end
|
31
|
+
|
32
|
+
case result
|
33
|
+
when Networking::Result::Redirect
|
34
|
+
task[:controller].stage(result.redirect_url)
|
35
|
+
when Networking::Result::Success
|
36
|
+
task[:page] = result.page
|
37
|
+
yield if block_given?
|
38
|
+
end
|
39
|
+
end
|
40
|
+
end
|
41
|
+
|
42
|
+
private
|
43
|
+
|
44
|
+
def pool
|
45
|
+
Wayfarer::Networking::Pool.instance
|
46
|
+
end
|
47
|
+
end
|
48
|
+
end
|
49
|
+
end
|
@@ -25,8 +25,8 @@ module Wayfarer
|
|
25
25
|
|
26
26
|
def supervise
|
27
27
|
yield
|
28
|
-
rescue *strategy.renew_on, *Wayfarer.config
|
29
|
-
renew
|
28
|
+
rescue *strategy.renew_on, *Wayfarer.config[:network][:renew_on] => e
|
29
|
+
renew # may raise
|
30
30
|
ensure
|
31
31
|
# If renewing raises, re-raise the originally caught exception
|
32
32
|
# TODO: Not nice: this effectively swallows exceptions raised while renewing
|
@@ -10,8 +10,8 @@ module Wayfarer
|
|
10
10
|
end
|
11
11
|
|
12
12
|
def create
|
13
|
-
::Ferrum::Browser.new(Wayfarer.config
|
14
|
-
browser.headers.set(Wayfarer.config
|
13
|
+
::Ferrum::Browser.new(Wayfarer.config[:ferrum][:options]).tap do |browser|
|
14
|
+
browser.headers.set(Wayfarer.config[:network][:http_headers])
|
15
15
|
end
|
16
16
|
end
|
17
17
|
|
@@ -2,15 +2,21 @@
|
|
2
2
|
|
3
3
|
module Wayfarer
|
4
4
|
module Networking
|
5
|
-
|
5
|
+
class Follow
|
6
|
+
RedirectsExhaustedError = Class.new(StandardError)
|
6
7
|
|
7
|
-
|
8
|
-
|
8
|
+
def self.http
|
9
|
+
new(Wayfarer::Networking::Context.new(Wayfarer::Networking::HTTP.new))
|
10
|
+
end
|
11
|
+
|
12
|
+
attr_reader :context
|
9
13
|
|
10
|
-
|
14
|
+
def initialize(context)
|
15
|
+
@context = context
|
16
|
+
end
|
11
17
|
|
12
|
-
def fetch(url, follow:
|
13
|
-
raise RedirectsExhaustedError if follow
|
18
|
+
def fetch(url, follow:)
|
19
|
+
raise RedirectsExhaustedError if follow < 0
|
14
20
|
|
15
21
|
case result = context.fetch(url)
|
16
22
|
when Result::Success then result.page
|
@@ -9,7 +9,7 @@ module Wayfarer
|
|
9
9
|
|
10
10
|
def create
|
11
11
|
Net::HTTP::Persistent.new(name: CONNECTION_NAME).tap do |conn|
|
12
|
-
Wayfarer.config
|
12
|
+
Wayfarer.config[:network][:http_headers].each do |key, val|
|
13
13
|
conn.override_headers[key] = val
|
14
14
|
end
|
15
15
|
end
|
@@ -5,25 +5,30 @@ module Wayfarer
|
|
5
5
|
class Pool
|
6
6
|
include Singleton
|
7
7
|
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
8
|
+
class_attribute :registry,
|
9
|
+
default: { http: HTTP,
|
10
|
+
ferrum: Ferrum,
|
11
|
+
selenium: Selenium,
|
12
|
+
capybara: Capybara },
|
13
|
+
instance_accessor: false,
|
14
|
+
instance_predicate: false
|
15
|
+
|
16
|
+
def initialize
|
17
|
+
@pool = ConnectionPool.new(size: Wayfarer.config[:network][:pool_size],
|
18
|
+
timeout: Wayfarer.config[:network][:pool_timeout],
|
19
|
+
&method(:context))
|
20
|
+
|
21
|
+
at_exit { free }
|
17
22
|
end
|
18
23
|
|
19
24
|
def with(&block)
|
20
|
-
pool.with(&block)
|
25
|
+
@pool.with(&block)
|
21
26
|
rescue ConnectionPool::TimeoutError => e
|
22
27
|
raise Wayfarer::UserAgentTimeoutError, e
|
23
28
|
end
|
24
29
|
|
25
30
|
def free
|
26
|
-
pool.shutdown(&:renew)
|
31
|
+
@pool.shutdown(&:renew)
|
27
32
|
end
|
28
33
|
|
29
34
|
private
|
@@ -33,7 +38,7 @@ module Wayfarer
|
|
33
38
|
end
|
34
39
|
|
35
40
|
def strategy
|
36
|
-
self.class.registry[Wayfarer.config
|
41
|
+
self.class.registry[Wayfarer.config[:network][:agent]].new
|
37
42
|
end
|
38
43
|
end
|
39
44
|
end
|
@@ -9,7 +9,7 @@ module Wayfarer
|
|
9
9
|
MOCK_RESPONSE_HEADERS = {}.freeze
|
10
10
|
|
11
11
|
def create
|
12
|
-
::Selenium::WebDriver.for(Wayfarer.config
|
12
|
+
::Selenium::WebDriver.for(Wayfarer.config[:selenium][:driver], **options)
|
13
13
|
end
|
14
14
|
|
15
15
|
def destroy(instance)
|
@@ -30,12 +30,12 @@ module Wayfarer
|
|
30
30
|
private
|
31
31
|
|
32
32
|
def options
|
33
|
-
Wayfarer.config
|
33
|
+
Wayfarer.config[:selenium][:options].merge(http_client: http_client)
|
34
34
|
end
|
35
35
|
|
36
36
|
def http_client
|
37
37
|
::Selenium::WebDriver::Remote::Http::Default.new.tap do |client|
|
38
|
-
client.read_timeout = Wayfarer.config
|
38
|
+
client.read_timeout = Wayfarer.config[:selenium][:client_timeout]
|
39
39
|
end
|
40
40
|
end
|
41
41
|
end
|
@@ -13,13 +13,13 @@ module Wayfarer
|
|
13
13
|
end
|
14
14
|
|
15
15
|
def navigate(_instance, _url)
|
16
|
-
raise
|
16
|
+
raise NotImplementedError
|
17
17
|
end
|
18
18
|
|
19
19
|
def live(_instance); end
|
20
20
|
|
21
21
|
def create
|
22
|
-
raise
|
22
|
+
raise NotImplementedError
|
23
23
|
end
|
24
24
|
|
25
25
|
def destroy(_instance); end
|
data/lib/wayfarer/page.rb
CHANGED
@@ -1,12 +1,22 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
3
|
module Wayfarer
|
4
|
+
# @!attribute [r] url
|
5
|
+
# @return [String] the URL that was fetched
|
6
|
+
# @!attribute [r] status_code
|
7
|
+
# @return [Integer] HTTP status code
|
8
|
+
# @!attribute [r] body
|
9
|
+
# @return [String] the body of the response
|
10
|
+
# @!attribute [r] headers
|
11
|
+
# @return [Hash] the headers of the response
|
12
|
+
# @note HTTP header keys are downcased, for example: `content-type`.
|
4
13
|
class Page
|
5
14
|
attr_reader :url,
|
6
15
|
:status_code,
|
7
16
|
:body,
|
8
17
|
:headers
|
9
18
|
|
19
|
+
# @!visibility private
|
10
20
|
def initialize(url:, status_code:, body:, headers:)
|
11
21
|
@url = url
|
12
22
|
@status_code = status_code
|
@@ -14,24 +24,36 @@ module Wayfarer
|
|
14
24
|
@headers = headers.transform_keys(&:downcase)
|
15
25
|
end
|
16
26
|
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
content_type = headers["content-type"]
|
24
|
-
sub_type = MIME::Types[content_type].first.sub_type
|
27
|
+
# Returns the MIME type of the response.
|
28
|
+
# @return [MIME::Type]
|
29
|
+
# @see https://www.rubydoc.info/gems/mime-types/MIME/Type
|
30
|
+
def mime_type
|
31
|
+
@mime_type ||= MIME::Types[content_type]&.first
|
32
|
+
end
|
25
33
|
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
34
|
+
# Returns a parsed representation of the HTTP response or the browser DOM,
|
35
|
+
# depending on the Content-Type.
|
36
|
+
# @return [Nokogiri::HTML::Document] when Content-Type is `text/html`
|
37
|
+
# @see https://www.rubydoc.info/github/sparklemotion/nokogiri/Nokogiri/HTML/Document Nokogiri::HTML::Document
|
38
|
+
# @return [Nokogiri::XML::Document] when Content-Type is `text/xml`
|
39
|
+
# @see https://www.rubydoc.info/github/sparklemotion/nokogiri/Nokogiri/XML/Document Nokogiri::XML::Document
|
40
|
+
# @return [Hash] when Content-Type is `application/json`
|
41
|
+
# @note You can register custom parsers with {Wayfarer::Parsing.registry}.
|
42
|
+
def doc
|
43
|
+
@doc ||= Wayfarer::Parsing.parse(body, mime_type&.content_type || content_type)
|
31
44
|
end
|
32
45
|
|
46
|
+
# Returns a `MetaInspector::Document`.
|
47
|
+
# @return [MetaInspector::Document]
|
48
|
+
# @see https://www.rubydoc.info/gems/metainspector/MetaInspector/Document
|
33
49
|
def meta
|
34
|
-
@meta ||= MetaInspector.new(url, document: body)
|
50
|
+
@meta ||= MetaInspector.new(url, document: body, headers: headers, normalize_url: false)
|
51
|
+
end
|
52
|
+
|
53
|
+
private
|
54
|
+
|
55
|
+
def content_type
|
56
|
+
@content_type ||= headers["content-type"]
|
35
57
|
end
|
36
58
|
end
|
37
59
|
end
|
data/lib/wayfarer/parsing/xml.rb
CHANGED
@@ -5,12 +5,12 @@ module Wayfarer
|
|
5
5
|
module XML
|
6
6
|
module_function
|
7
7
|
|
8
|
-
def
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
8
|
+
def parse(xml, variant)
|
9
|
+
case variant
|
10
|
+
when :xml then Nokogiri::XML(xml)
|
11
|
+
when :html then Nokogiri::HTML(xml)
|
12
|
+
else raise ArgumentError, "Unknown type: #{type}"
|
13
|
+
end
|
14
14
|
end
|
15
15
|
end
|
16
16
|
end
|
@@ -0,0 +1,24 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Wayfarer
|
4
|
+
# @!scope class
|
5
|
+
# @!attribute [r] registry
|
6
|
+
# @return [Hash] Mapping of Content-Type to parser.
|
7
|
+
module Parsing
|
8
|
+
# @!visibility private
|
9
|
+
FALLBACK_CONTENT_TYPE = "application/octet-stream"
|
10
|
+
|
11
|
+
mattr_accessor :registry, default: { "application/json" => JSON,
|
12
|
+
"text/html" => [XML, :html],
|
13
|
+
"application/xml" => [XML, :xml] }
|
14
|
+
|
15
|
+
module_function
|
16
|
+
|
17
|
+
# @!visibility private
|
18
|
+
def parse(body, content_type = FALLBACK_CONTENT_TYPE)
|
19
|
+
parser, args = registry[content_type] || return
|
20
|
+
|
21
|
+
parser.parse(body, *args)
|
22
|
+
end
|
23
|
+
end
|
24
|
+
end
|
@@ -2,35 +2,27 @@
|
|
2
2
|
|
3
3
|
module Wayfarer
|
4
4
|
module Redis
|
5
|
-
Barrier
|
6
|
-
include
|
5
|
+
class Barrier
|
6
|
+
include Resettable
|
7
7
|
|
8
|
-
|
9
|
-
"wayfarer-barrier-#{batch}"
|
10
|
-
end
|
8
|
+
attr_reader :task
|
11
9
|
|
12
|
-
def
|
13
|
-
|
10
|
+
def initialize(task)
|
11
|
+
@task = task
|
12
|
+
@redis_pool = task[:redis_pool]
|
14
13
|
end
|
15
14
|
|
16
|
-
def
|
17
|
-
|
15
|
+
def redis_key
|
16
|
+
"wayfarer-barrier-#{task.batch}"
|
18
17
|
end
|
19
18
|
|
20
|
-
def
|
21
|
-
|
22
|
-
|
23
|
-
# SMISMEMBER is only supported on Redis >= 6.2.0
|
24
|
-
if major > 6 || (major == 6 && minor >= 2)
|
25
|
-
redis { |conn| conn.smismember(redis_key, urls) }.map { |val| val == 1 }
|
26
|
-
else
|
27
|
-
urls.map { |url| redis { |conn| conn.sismember(redis_key, url) } }
|
28
|
-
end
|
19
|
+
def check!(url)
|
20
|
+
!redis_pool.with { |conn| conn.hsetnx(redis_key, url, "") }
|
29
21
|
end
|
30
22
|
|
31
|
-
|
32
|
-
|
33
|
-
|
23
|
+
private
|
24
|
+
|
25
|
+
attr_reader :redis_pool
|
34
26
|
end
|
35
27
|
end
|
36
28
|
end
|
@@ -2,28 +2,38 @@
|
|
2
2
|
|
3
3
|
module Wayfarer
|
4
4
|
module Redis
|
5
|
-
Counter
|
6
|
-
include
|
5
|
+
class Counter
|
6
|
+
include Resettable
|
7
7
|
|
8
|
-
|
9
|
-
|
8
|
+
attr_reader :task
|
9
|
+
|
10
|
+
def initialize(task, &callback)
|
11
|
+
@task = task
|
12
|
+
@callback = callback
|
13
|
+
@redis_pool = task[:redis_pool]
|
10
14
|
end
|
11
15
|
|
12
|
-
def
|
13
|
-
|
16
|
+
def redis_key
|
17
|
+
"wayfarer-counter-#{@task.batch}"
|
14
18
|
end
|
15
19
|
|
16
20
|
def value
|
17
|
-
|
21
|
+
redis_pool.with { |conn| conn.get(redis_key) }.to_i
|
18
22
|
end
|
19
23
|
|
20
24
|
def increment
|
21
|
-
|
25
|
+
redis_pool.with { |conn| conn.incr(redis_key) }
|
22
26
|
end
|
23
27
|
|
24
28
|
def decrement
|
25
|
-
|
29
|
+
redis_pool.with { |conn| conn.decr(redis_key) }.tap do |val|
|
30
|
+
@callback&.call if val == 0
|
31
|
+
end
|
26
32
|
end
|
33
|
+
|
34
|
+
private
|
35
|
+
|
36
|
+
attr_reader :redis_pool
|
27
37
|
end
|
28
38
|
end
|
29
39
|
end
|