wayfarer 0.4.6 → 0.4.7

Sign up to get free protection for your applications and to get access to all the features.
Files changed (175)
  1. checksums.yaml +4 -4
  2. data/.github/workflows/lint.yaml +25 -0
  3. data/.github/workflows/release.yaml +29 -0
  4. data/.github/workflows/tests.yaml +30 -0
  5. data/.gitignore +4 -0
  6. data/.rubocop.yml +5 -0
  7. data/.vale.ini +5 -0
  8. data/.yardopts +1 -3
  9. data/Dockerfile +5 -4
  10. data/Gemfile +3 -0
  11. data/Gemfile.lock +107 -102
  12. data/Rakefile +5 -56
  13. data/bin/wayfarer +1 -1
  14. data/docker-compose.yml +20 -9
  15. data/docs/cookbook/consent_screen.md +2 -2
  16. data/docs/cookbook/executing_javascript.md +3 -3
  17. data/docs/cookbook/navigation.md +12 -12
  18. data/docs/cookbook/querying_html.md +3 -3
  19. data/docs/cookbook/screenshots.md +2 -2
  20. data/docs/cookbook/user_agent.md +1 -1
  21. data/docs/design.md +36 -0
  22. data/docs/guides/callbacks.md +24 -126
  23. data/docs/guides/configuration.md +8 -8
  24. data/docs/guides/handlers.md +60 -0
  25. data/docs/guides/index.md +1 -0
  26. data/docs/guides/jobs/error_handling.md +40 -0
  27. data/docs/guides/jobs.md +99 -31
  28. data/docs/guides/navigation.md +1 -1
  29. data/docs/guides/networking/capybara.md +13 -22
  30. data/docs/guides/networking/custom_adapters.md +82 -41
  31. data/docs/guides/networking/ferrum.md +4 -4
  32. data/docs/guides/networking/http.md +9 -13
  33. data/docs/guides/networking/selenium.md +10 -11
  34. data/docs/guides/pages.md +76 -10
  35. data/docs/guides/redis.md +10 -0
  36. data/docs/guides/routing.md +74 -0
  37. data/docs/guides/tasks.md +33 -9
  38. data/docs/guides/tutorial.md +60 -0
  39. data/docs/guides/user_agents.md +113 -0
  40. data/docs/index.md +17 -40
  41. data/docs/reference/cli.md +35 -25
  42. data/docs/reference/configuration.md +36 -0
  43. data/lib/wayfarer/base.rb +124 -46
  44. data/lib/wayfarer/batch_completion.rb +56 -0
  45. data/lib/wayfarer/callbacks.rb +22 -48
  46. data/lib/wayfarer/cli/route_printer.rb +71 -57
  47. data/lib/wayfarer/cli.rb +121 -0
  48. data/lib/wayfarer/gc.rb +13 -6
  49. data/lib/wayfarer/handler.rb +15 -7
  50. data/lib/wayfarer/logging.rb +38 -0
  51. data/lib/wayfarer/middleware/base.rb +2 -0
  52. data/lib/wayfarer/middleware/batch_completion.rb +19 -0
  53. data/lib/wayfarer/middleware/content_type.rb +54 -0
  54. data/lib/wayfarer/middleware/controller.rb +19 -15
  55. data/lib/wayfarer/middleware/dedup.rb +16 -13
  56. data/lib/wayfarer/middleware/dispatch.rb +12 -4
  57. data/lib/wayfarer/middleware/normalize.rb +12 -11
  58. data/lib/wayfarer/middleware/redis.rb +15 -0
  59. data/lib/wayfarer/middleware/router.rb +33 -35
  60. data/lib/wayfarer/middleware/stage.rb +5 -5
  61. data/lib/wayfarer/middleware/uri_parser.rb +30 -0
  62. data/lib/wayfarer/middleware/user_agent.rb +49 -0
  63. data/lib/wayfarer/networking/capybara.rb +1 -1
  64. data/lib/wayfarer/networking/context.rb +2 -2
  65. data/lib/wayfarer/networking/ferrum.rb +2 -2
  66. data/lib/wayfarer/networking/follow.rb +12 -6
  67. data/lib/wayfarer/networking/http.rb +1 -1
  68. data/lib/wayfarer/networking/pool.rb +17 -12
  69. data/lib/wayfarer/networking/selenium.rb +3 -3
  70. data/lib/wayfarer/networking/strategy.rb +2 -2
  71. data/lib/wayfarer/page.rb +36 -14
  72. data/lib/wayfarer/parsing/xml.rb +6 -6
  73. data/lib/wayfarer/parsing.rb +24 -0
  74. data/lib/wayfarer/redis/barrier.rb +13 -21
  75. data/lib/wayfarer/redis/counter.rb +19 -9
  76. data/lib/wayfarer/redis/pool.rb +1 -1
  77. data/lib/wayfarer/redis/resettable.rb +19 -0
  78. data/lib/wayfarer/routing/dsl.rb +1 -0
  79. data/lib/wayfarer/routing/matchers/path.rb +4 -2
  80. data/lib/wayfarer/routing/root_route.rb +5 -1
  81. data/lib/wayfarer/routing/route.rb +4 -14
  82. data/lib/wayfarer/stringify.rb +22 -30
  83. data/lib/wayfarer/task.rb +12 -18
  84. data/lib/wayfarer.rb +28 -1
  85. data/mkdocs.yml +52 -7
  86. data/rake/docs.rake +26 -0
  87. data/rake/lint.rake +105 -0
  88. data/rake/release.rake +29 -0
  89. data/rake/tests.rake +28 -0
  90. data/requirements.txt +1 -1
  91. data/spec/base_spec.rb +140 -160
  92. data/spec/batch_completion_spec.rb +104 -0
  93. data/spec/cli/job_spec.rb +19 -23
  94. data/spec/cli/routing_spec.rb +101 -0
  95. data/spec/cli/version_spec.rb +1 -1
  96. data/spec/factories/task.rb +7 -1
  97. data/spec/fixtures/dummy_job.rb +5 -3
  98. data/spec/gc_spec.rb +8 -50
  99. data/spec/handler_spec.rb +1 -1
  100. data/spec/integration/callbacks_spec.rb +157 -45
  101. data/spec/integration/content_type_spec.rb +145 -0
  102. data/spec/integration/gc_spec.rb +44 -0
  103. data/spec/integration/handler_spec.rb +66 -0
  104. data/spec/integration/page_spec.rb +44 -29
  105. data/spec/integration/params_spec.rb +33 -25
  106. data/spec/integration/parsing_spec.rb +125 -0
  107. data/spec/integration/routing_spec.rb +18 -0
  108. data/spec/integration/stage_spec.rb +27 -20
  109. data/spec/middleware/batch_completion_spec.rb +34 -0
  110. data/spec/middleware/chain_spec.rb +8 -8
  111. data/spec/middleware/content_type_spec.rb +86 -0
  112. data/spec/middleware/controller_spec.rb +5 -5
  113. data/spec/middleware/dedup_spec.rb +38 -55
  114. data/spec/middleware/dispatch_spec.rb +23 -7
  115. data/spec/middleware/normalize_spec.rb +44 -13
  116. data/spec/middleware/router_spec.rb +29 -30
  117. data/spec/middleware/stage_spec.rb +8 -8
  118. data/spec/middleware/uri_parser_spec.rb +53 -0
  119. data/spec/middleware/{fetch_spec.rb → user_agent_spec.rb} +28 -27
  120. data/spec/networking/context_spec.rb +1 -1
  121. data/spec/networking/follow_spec.rb +2 -2
  122. data/spec/networking/pool_spec.rb +5 -5
  123. data/spec/networking/strategy.rb +2 -2
  124. data/spec/page_spec.rb +42 -20
  125. data/spec/parsing/xml_spec.rb +11 -12
  126. data/spec/redis/barrier_spec.rb +8 -48
  127. data/spec/redis/counter_spec.rb +13 -1
  128. data/spec/redis/pool_spec.rb +1 -1
  129. data/spec/spec_helpers.rb +27 -16
  130. data/spec/support/test_app.rb +8 -0
  131. data/spec/task_spec.rb +3 -24
  132. data/spec/wayfarer_spec.rb +1 -1
  133. data/wayfarer.gemspec +4 -3
  134. metadata +61 -51
  135. data/.github/workflows/ci.yaml +0 -32
  136. data/docs/guides/error_handling.md +0 -53
  137. data/docs/guides/networking.md +0 -94
  138. data/docs/guides/performance.md +0 -130
  139. data/docs/guides/reliability.md +0 -41
  140. data/docs/guides/routing/steering.md +0 -30
  141. data/docs/reference/api/base.md +0 -48
  142. data/docs/reference/configuration_keys.md +0 -43
  143. data/docs/reference/environment_variables.md +0 -83
  144. data/lib/wayfarer/cli/base.rb +0 -45
  145. data/lib/wayfarer/cli/generate.rb +0 -17
  146. data/lib/wayfarer/cli/job.rb +0 -56
  147. data/lib/wayfarer/cli/route.rb +0 -29
  148. data/lib/wayfarer/cli/runner.rb +0 -34
  149. data/lib/wayfarer/cli/templates/Gemfile.tt +0 -5
  150. data/lib/wayfarer/cli/templates/job.rb.tt +0 -10
  151. data/lib/wayfarer/config/capybara.rb +0 -10
  152. data/lib/wayfarer/config/ferrum.rb +0 -11
  153. data/lib/wayfarer/config/networking.rb +0 -29
  154. data/lib/wayfarer/config/redis.rb +0 -14
  155. data/lib/wayfarer/config/root.rb +0 -11
  156. data/lib/wayfarer/config/selenium.rb +0 -21
  157. data/lib/wayfarer/config/strconv.rb +0 -45
  158. data/lib/wayfarer/config/struct.rb +0 -72
  159. data/lib/wayfarer/middleware/fetch.rb +0 -56
  160. data/lib/wayfarer/redis/connection.rb +0 -13
  161. data/lib/wayfarer/redis/version.rb +0 -19
  162. data/lib/wayfarer/routing/router.rb +0 -28
  163. data/spec/callbacks_spec.rb +0 -102
  164. data/spec/cli/generate_spec.rb +0 -39
  165. data/spec/config/capybara_spec.rb +0 -18
  166. data/spec/config/ferrum_spec.rb +0 -24
  167. data/spec/config/networking_spec.rb +0 -73
  168. data/spec/config/redis_spec.rb +0 -32
  169. data/spec/config/root_spec.rb +0 -31
  170. data/spec/config/selenium_spec.rb +0 -56
  171. data/spec/config/strconv_spec.rb +0 -58
  172. data/spec/config/struct_spec.rb +0 -66
  173. data/spec/integration/steering_spec.rb +0 -57
  174. data/spec/redis/version_spec.rb +0 -13
  175. data/spec/routing/router_spec.rb +0 -24
@@ -6,12 +6,20 @@ module Wayfarer
6
6
  extend Base
7
7
 
8
8
  def call(task)
9
- controller = task.metadata.controller
9
+ controller = task[:controller]
10
10
 
11
- controller.run_callbacks(:action) do
12
- case action = task.metadata.action
11
+ task[:return_value] = controller.run_callbacks(:action) do
12
+ case action = task[:action]
13
13
  when Symbol then controller.public_send(action)
14
- else action.new.call(task)
14
+ when Array
15
+ handler, method = action
16
+ task[:action] = method
17
+ handler.new.call(task)
18
+ else
19
+ raise ArgumentError, "invalid action: #{action.inspect}" unless action&.include?(Wayfarer::Handler)
20
+
21
+ task[:action] = nil # TODO: Test
22
+ action.new.call(task)
15
23
  end
16
24
  end
17
25
 
@@ -4,23 +4,24 @@ module Wayfarer
4
4
  module Middleware
5
5
  class Normalize
6
6
  extend Base
7
+ include Wayfarer::Logging.emit(
8
+ invalid: [:info, "Failed to normalize HTTP(S) URL"]
9
+ )
7
10
 
8
- def call(task)
9
- yield if block_given?
11
+ def self.normalize(uri)
12
+ return uri.to_s unless %w[http https].include?(uri.scheme)
10
13
 
11
- task.metadata.staged_urls = SortedSet.new(normalized_urls(task).compact)
14
+ NormalizeUrl.process(uri)
15
+ rescue NormalizeUrl::InvalidURIError
16
+ nil
12
17
  end
13
18
 
14
- private
19
+ def call(task)
20
+ return (yield if block_given?) if task[:normalized_url]
15
21
 
16
- def normalized_urls(task)
17
- task.metadata.staged_urls.map(&method(:normalize))
18
- end
22
+ return log(:invalid, task) unless (task[:normalized_url] = self.class.normalize(task[:uri]))
19
23
 
20
- def normalize(url)
21
- NormalizeUrl.process(url)
22
- rescue NormalizeUrl::InvalidURIError
23
- nil
24
+ yield if block_given?
24
25
  end
25
26
  end
26
27
  end
@@ -0,0 +1,15 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Wayfarer
4
+ module Middleware
5
+ class Redis
6
+ extend Base
7
+
8
+ def call(task)
9
+ task[:redis_pool] ||= Wayfarer::Redis::Pool.instance
10
+
11
+ yield if block_given?
12
+ end
13
+ end
14
+ end
15
+ end
@@ -5,51 +5,49 @@ module Wayfarer
5
5
  class Router
6
6
  extend Base
7
7
 
8
- module API
9
- def self.included(base)
10
- base.extend(ClassMethods)
11
- base.include(InstanceMethods)
12
- end
13
-
14
- module ClassMethods
15
- def router
16
- # TODO: Use cattr_accessor
17
- @router ||= Wayfarer::Routing::Router.new
18
- end
8
+ include Wayfarer::Logging.emit(
9
+ mismatch: [:info, "No matching route"],
10
+ match: [:info, "Routing to %<action>s"],
11
+ already_routed: [:debug, "Already routed to %<action>s"]
12
+ )
19
13
 
20
- def route(&block)
21
- router.draw(&block) if block_given?
22
- end
14
+ module API
15
+ extend ActiveSupport::Concern
23
16
 
24
- def steer(&block)
25
- define_method(:steer) { block.call(task) }
26
- end
17
+ included do
18
+ class_attribute :route,
19
+ default: Wayfarer::Routing::RootRoute.new,
20
+ instance_accessor: false,
21
+ instance_predicate: false
27
22
  end
28
23
 
29
- module InstanceMethods
30
- def steer
31
- []
32
- end
24
+ def action
25
+ task[:action]
26
+ end
33
27
 
34
- def params
35
- task.metadata.params
36
- end
28
+ def params
29
+ task[:params]
37
30
  end
38
31
  end
39
32
 
40
33
  def call(task)
41
- controller = task.metadata.controller
42
- # TODO: The router has to be cloned because it's not thread-safe
43
- router = controller.class.router.clone
44
- url = Addressable::URI.parse(task.url)
45
-
46
- case result = router.invoke(url, controller.steer)
47
- when Routing::Result::Mismatch
48
- return
34
+ # Avoid rerouting when dispatching a [Controller, :action] pair
35
+ if (action = task[:action])
36
+ log(:already_routed, task, action: action)
37
+
38
+ return (yield if block_given?)
39
+ end
40
+
41
+ case result = task[:controller].class.route.invoke(task[:uri])
42
+ when Routing::Result::Mismatch then return log(:mismatch, task)
49
43
  when Routing::Result::Match
50
- task.metadata.action = result.action
51
- task.metadata.params ||= ActiveSupport::HashWithIndifferentAccess.new
52
- task.metadata.params.merge!(result.params)
44
+ action = result.action
45
+
46
+ log(:match, task, action: action.inspect)
47
+
48
+ task[:action] = action
49
+ task[:params] ||= ActiveSupport::HashWithIndifferentAccess.new
50
+ task[:params].merge!(result.params)
53
51
  end
54
52
 
55
53
  yield if block_given?
@@ -7,20 +7,20 @@ module Wayfarer
7
7
 
8
8
  module API
9
9
  def stage(urls)
10
- Array.wrap(urls).each { |url| task.metadata.staged_urls.add(url.to_s) }
10
+ Array.wrap(urls).each { |url| task[:staged_urls].add(url.to_s) }
11
11
  end
12
12
  end
13
13
 
14
14
  def call(task)
15
- task.metadata.staged_urls = SortedSet.new
15
+ task[:staged_urls] = Set.new
16
16
 
17
17
  yield if block_given?
18
18
 
19
- task.metadata.staged_urls.each do |url|
20
- task.metadata.job.class.crawl(url, batch: task.batch)
19
+ task[:staged_urls].each do |url|
20
+ task[:job].class.crawl(url, batch: task.batch)
21
21
  end
22
22
 
23
- task.metadata.staged_urls.clear
23
+ task[:staged_urls].clear
24
24
  end
25
25
  end
26
26
  end
@@ -0,0 +1,30 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Wayfarer
4
+ module Middleware
5
+ class UriParser
6
+ extend Base
7
+
8
+ include Wayfarer::Logging.emit(
9
+ invalid: [:info, "Not processing invalid URL (%<message>s)"]
10
+ )
11
+
12
+ module API
13
+ def uri
14
+ task[:uri]
15
+ end
16
+ end
17
+
18
+ def call(task)
19
+ # TODO: Test
20
+ task[:uri] ||= begin
21
+ Addressable::URI.parse(task.url).normalize
22
+ rescue Addressable::URI::InvalidURIError => e
23
+ return log(:invalid, task, message: e.message)
24
+ end
25
+
26
+ yield if block_given?
27
+ end
28
+ end
29
+ end
30
+ end
@@ -0,0 +1,49 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Wayfarer
4
+ module Middleware
5
+ class UserAgent
6
+ extend Base
7
+
8
+ module API
9
+ def user_agent
10
+ task[:context]&.instance
11
+ end
12
+
13
+ def page(live: false)
14
+ return task[:page] unless live
15
+
16
+ task[:page] = task[:context].live&.page || task[:page]
17
+ end
18
+
19
+ def fetch(url, follow: 3)
20
+ (@http ||= Wayfarer::Networking::Follow.http).fetch(url, follow: follow)
21
+ end
22
+ end
23
+
24
+ def call(task)
25
+ pool.with do |context|
26
+ task[:context] = context
27
+
28
+ result = task[:controller].run_callbacks(:fetch) do
29
+ context.fetch(task.url)
30
+ end
31
+
32
+ case result
33
+ when Networking::Result::Redirect
34
+ task[:controller].stage(result.redirect_url)
35
+ when Networking::Result::Success
36
+ task[:page] = result.page
37
+ yield if block_given?
38
+ end
39
+ end
40
+ end
41
+
42
+ private
43
+
44
+ def pool
45
+ Wayfarer::Networking::Pool.instance
46
+ end
47
+ end
48
+ end
49
+ end
@@ -6,7 +6,7 @@ module Wayfarer
6
6
  include Strategy
7
7
 
8
8
  def create
9
- ::Capybara::Session.new(Wayfarer.config.capybara.driver, nil)
9
+ ::Capybara::Session.new(Wayfarer.config[:capybara][:driver], nil)
10
10
  end
11
11
 
12
12
  def destroy(instance)
@@ -25,8 +25,8 @@ module Wayfarer
25
25
 
26
26
  def supervise
27
27
  yield
28
- rescue *strategy.renew_on, *Wayfarer.config.network.renew_on => e
29
- renew
28
+ rescue *strategy.renew_on, *Wayfarer.config[:network][:renew_on] => e
29
+ renew # may raise
30
30
  ensure
31
31
  # If renewing raises, re-raise the originally caught exception
32
32
  # TODO: Not nice this effectively swallows exceptions
@@ -10,8 +10,8 @@ module Wayfarer
10
10
  end
11
11
 
12
12
  def create
13
- ::Ferrum::Browser.new(Wayfarer.config.ferrum.options).tap do |browser|
14
- browser.headers.set(Wayfarer.config.network.http_headers)
13
+ ::Ferrum::Browser.new(Wayfarer.config[:ferrum][:options]).tap do |browser|
14
+ browser.headers.set(Wayfarer.config[:network][:http_headers])
15
15
  end
16
16
  end
17
17
 
@@ -2,15 +2,21 @@
2
2
 
3
3
  module Wayfarer
4
4
  module Networking
5
- RedirectsExhaustedError = Class.new(StandardError)
5
+ class Follow
6
+ RedirectsExhaustedError = Class.new(StandardError)
6
7
 
7
- Follow = Struct.new(:context) do
8
- extend Forwardable
8
+ def self.http
9
+ new(Wayfarer::Networking::Context.new(Wayfarer::Networking::HTTP.new))
10
+ end
11
+
12
+ attr_reader :context
9
13
 
10
- delegate %i[live renew instance] => :context
14
+ def initialize(context)
15
+ @context = context
16
+ end
11
17
 
12
- def fetch(url, follow: 3)
13
- raise RedirectsExhaustedError if follow.negative?
18
+ def fetch(url, follow:)
19
+ raise RedirectsExhaustedError if follow < 0
14
20
 
15
21
  case result = context.fetch(url)
16
22
  when Result::Success then result.page
@@ -9,7 +9,7 @@ module Wayfarer
9
9
 
10
10
  def create
11
11
  Net::HTTP::Persistent.new(name: CONNECTION_NAME).tap do |conn|
12
- Wayfarer.config.network.http_headers.each do |key, val|
12
+ Wayfarer.config[:network][:http_headers].each do |key, val|
13
13
  conn.override_headers[key] = val
14
14
  end
15
15
  end
@@ -5,25 +5,30 @@ module Wayfarer
5
5
  class Pool
6
6
  include Singleton
7
7
 
8
- cattr_accessor :registry, default: { http: HTTP,
9
- ferrum: Ferrum,
10
- selenium: Selenium,
11
- capybara: Capybara }
12
-
13
- def pool
14
- @pool ||= ConnectionPool.new(size: Wayfarer.config.network.pool_size,
15
- timeout: Wayfarer.config.network.pool_timeout,
16
- &method(:context))
8
+ class_attribute :registry,
9
+ default: { http: HTTP,
10
+ ferrum: Ferrum,
11
+ selenium: Selenium,
12
+ capybara: Capybara },
13
+ instance_accessor: false,
14
+ instance_predicate: false
15
+
16
+ def initialize
17
+ @pool = ConnectionPool.new(size: Wayfarer.config[:network][:pool_size],
18
+ timeout: Wayfarer.config[:network][:pool_timeout],
19
+ &method(:context))
20
+
21
+ at_exit { free }
17
22
  end
18
23
 
19
24
  def with(&block)
20
- pool.with(&block)
25
+ @pool.with(&block)
21
26
  rescue ConnectionPool::TimeoutError => e
22
27
  raise Wayfarer::UserAgentTimeoutError, e
23
28
  end
24
29
 
25
30
  def free
26
- pool.shutdown(&:renew)
31
+ @pool.shutdown(&:renew)
27
32
  end
28
33
 
29
34
  private
@@ -33,7 +38,7 @@ module Wayfarer
33
38
  end
34
39
 
35
40
  def strategy
36
- self.class.registry[Wayfarer.config.network.agent].new
41
+ self.class.registry[Wayfarer.config[:network][:agent]].new
37
42
  end
38
43
  end
39
44
  end
@@ -9,7 +9,7 @@ module Wayfarer
9
9
  MOCK_RESPONSE_HEADERS = {}.freeze
10
10
 
11
11
  def create
12
- ::Selenium::WebDriver.for(Wayfarer.config.selenium.driver, **options)
12
+ ::Selenium::WebDriver.for(Wayfarer.config[:selenium][:driver], **options)
13
13
  end
14
14
 
15
15
  def destroy(instance)
@@ -30,12 +30,12 @@ module Wayfarer
30
30
  private
31
31
 
32
32
  def options
33
- Wayfarer.config.selenium.options.merge(http_client: http_client)
33
+ Wayfarer.config[:selenium][:options].merge(http_client: http_client)
34
34
  end
35
35
 
36
36
  def http_client
37
37
  ::Selenium::WebDriver::Remote::Http::Default.new.tap do |client|
38
- client.read_timeout = Wayfarer.config.selenium.client_timeout
38
+ client.read_timeout = Wayfarer.config[:selenium][:client_timeout]
39
39
  end
40
40
  end
41
41
  end
@@ -13,13 +13,13 @@ module Wayfarer
13
13
  end
14
14
 
15
15
  def navigate(_instance, _url)
16
- raise NoMethodError
16
+ raise NotImplementedError
17
17
  end
18
18
 
19
19
  def live(_instance); end
20
20
 
21
21
  def create
22
- raise NoMethodError
22
+ raise NotImplementedError
23
23
  end
24
24
 
25
25
  def destroy(_instance); end
data/lib/wayfarer/page.rb CHANGED
@@ -1,12 +1,22 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Wayfarer
4
+ # @!attribute [r] url
5
+ # @return [String] the URL that was fetched
6
+ # @!attribute [r] status_code
7
+ # @return [Fixnum] HTTP status code
8
+ # @!attribute [r] body
9
+ # @return [String] the body of the response
10
+ # @!attribute [r] headers
11
+ # @return [Hash] the headers of the response
12
+ # @note HTTP header keys are downcased, for example: `content-type`.
4
13
  class Page
5
14
  attr_reader :url,
6
15
  :status_code,
7
16
  :body,
8
17
  :headers
9
18
 
19
+ # @!visibility private
10
20
  def initialize(url:, status_code:, body:, headers:)
11
21
  @url = url
12
22
  @status_code = status_code
@@ -14,24 +24,36 @@ module Wayfarer
14
24
  @headers = headers.transform_keys(&:downcase)
15
25
  end
16
26
 
17
- def doc
18
- return @doc if @doc
19
-
20
- # If no Content-Type field is present, assume HTML/XML
21
- return @doc = Wayfarer::Parsing::XML.parse_html(body) unless headers["content-type"]
22
-
23
- content_type = headers["content-type"]
24
- sub_type = MIME::Types[content_type].first.sub_type
27
+ # Returns the MIME type of the response.
28
+ # @return [MIME::Type]
29
+ # @see https://www.rubydoc.info/gems/mime-types/MIME/Type
30
+ def mime_type
31
+ @mime_type ||= MIME::Types[content_type]&.first
32
+ end
25
33
 
26
- @doc = case sub_type
27
- when "json" then Wayfarer::Parsing::JSON.parse(body)
28
- when "xml" then Wayfarer::Parsing::XML.parse_xml(body)
29
- else Wayfarer::Parsing::XML.parse_html(body)
30
- end
34
+ # Returns a parsed representation of the HTTP response or the browser DOM,
35
+ # depending on the Content-Type.
36
+ # @return [Nokogiri::HTML::Document] when Content-Type is `text/html`
37
+ # @see https://www.rubydoc.info/github/sparklemotion/nokogiri/Nokogiri/HTML/Document Nokogiri::HTML::Document
38
+ # @return [Nokogiri::XML::Document] when Content-Type is `text/xml`
39
+ # @see https://www.rubydoc.info/github/sparklemotion/nokogiri/Nokogiri/XML/Document Nokogiri::XML::Document
40
+ # @return [Hash] when Content-Type is `application/json`
41
+ # @note You can register custom parsers with {Wayfarer::Parsing.registry}.
42
+ def doc
43
+ @doc ||= Wayfarer::Parsing.parse(body, mime_type&.content_type || content_type)
31
44
  end
32
45
 
46
+ # Returns a `MetaInspector::Document`.
47
+ # @return [MetaInspector::Document]
48
+ # @see https://www.rubydoc.info/gems/metainspector/MetaInspector/Document
33
49
  def meta
34
- @meta ||= MetaInspector.new(url, document: body)
50
+ @meta ||= MetaInspector.new(url, document: body, headers: headers, normalize_url: false)
51
+ end
52
+
53
+ private
54
+
55
+ def content_type
56
+ @content_type ||= headers["content-type"]
35
57
  end
36
58
  end
37
59
  end
@@ -5,12 +5,12 @@ module Wayfarer
5
5
  module XML
6
6
  module_function
7
7
 
8
- def parse_xml(xml)
9
- Nokogiri::XML(xml)
10
- end
11
-
12
- def parse_html(html)
13
- Nokogiri::HTML(html)
8
+ def parse(xml, variant)
9
+ case variant
10
+ when :xml then Nokogiri::XML(xml)
11
+ when :html then Nokogiri::HTML(xml)
12
+ else raise ArgumentError, "Unknown type: #{type}"
13
+ end
14
14
  end
15
15
  end
16
16
  end
@@ -0,0 +1,24 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Wayfarer
4
+ # @!scope class
5
+ # @!attribute [r] registry
6
+ # @return [Hash] Mapping of Content-Type to parser.
7
+ module Parsing
8
+ # @!visibility private
9
+ FALLBACK_CONTENT_TYPE = "application/octet-stream"
10
+
11
+ mattr_accessor :registry, default: { "application/json" => JSON,
12
+ "text/html" => [XML, :html],
13
+ "application/xml" => [XML, :xml] }
14
+
15
+ module_function
16
+
17
+ # @!visibility private
18
+ def parse(body, content_type = FALLBACK_CONTENT_TYPE)
19
+ parser, args = registry[content_type] || return
20
+
21
+ parser.parse(body, *args)
22
+ end
23
+ end
24
+ end
@@ -2,35 +2,27 @@
2
2
 
3
3
  module Wayfarer
4
4
  module Redis
5
- Barrier = Struct.new(:batch) do
6
- include Connection
5
+ class Barrier
6
+ include Resettable
7
7
 
8
- def redis_key
9
- "wayfarer-barrier-#{batch}"
10
- end
8
+ attr_reader :task
11
9
 
12
- def reset!
13
- redis { |conn| conn.del(redis_key) }
10
+ def initialize(task)
11
+ @task = task
12
+ @redis_pool = task[:redis_pool]
14
13
  end
15
14
 
16
- def seen?(url)
17
- !redis { |conn| conn.sadd(redis_key, url) }
15
+ def redis_key
16
+ "wayfarer-barrier-#{task.batch}"
18
17
  end
19
18
 
20
- def peek(urls)
21
- major, minor, = Version.determine
22
-
23
- # SMISMEMBER is only supported on Redis >= 6.2.0
24
- if major > 6 || (major == 6 && minor >= 2)
25
- redis { |conn| conn.smismember(redis_key, urls) }.map { |val| val == 1 }
26
- else
27
- urls.map { |url| redis { |conn| conn.sismember(redis_key, url) } }
28
- end
19
+ def check!(url)
20
+ !redis_pool.with { |conn| conn.hsetnx(redis_key, url, "") }
29
21
  end
30
22
 
31
- def unsee(url)
32
- redis { |conn| conn.srem(redis_key, url) }
33
- end
23
+ private
24
+
25
+ attr_reader :redis_pool
34
26
  end
35
27
  end
36
28
  end
@@ -2,28 +2,38 @@
2
2
 
3
3
  module Wayfarer
4
4
  module Redis
5
- Counter = Struct.new(:batch) do
6
- include Connection
5
+ class Counter
6
+ include Resettable
7
7
 
8
- def redis_key
9
- "wayfarer-counter-#{batch}"
8
+ attr_reader :task
9
+
10
+ def initialize(task, &callback)
11
+ @task = task
12
+ @callback = callback
13
+ @redis_pool = task[:redis_pool]
10
14
  end
11
15
 
12
- def reset!
13
- redis { |conn| conn.del(redis_key) }
16
+ def redis_key
17
+ "wayfarer-counter-#{@task.batch}"
14
18
  end
15
19
 
16
20
  def value
17
- redis { |conn| conn.get(redis_key) }.to_i
21
+ redis_pool.with { |conn| conn.get(redis_key) }.to_i
18
22
  end
19
23
 
20
24
  def increment
21
- redis { |conn| conn.incr(redis_key) }
25
+ redis_pool.with { |conn| conn.incr(redis_key) }
22
26
  end
23
27
 
24
28
  def decrement
25
- redis { |conn| conn.decr(redis_key) }
29
+ redis_pool.with { |conn| conn.decr(redis_key) }.tap do |val|
30
+ @callback&.call if val == 0
31
+ end
26
32
  end
33
+
34
+ private
35
+
36
+ attr_reader :redis_pool
27
37
  end
28
38
  end
29
39
  end
@@ -10,7 +10,7 @@ module Wayfarer
10
10
 
11
11
  def initialize
12
12
  @pool = ConnectionPool.new do
13
- Wayfarer.config.redis.factory.call(Wayfarer.config.redis)
13
+ Wayfarer.config[:redis][:factory].call(Wayfarer.config[:redis])
14
14
  end
15
15
  end
16
16