wayfarer 0.4.5 → 0.4.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (175)
  1. checksums.yaml +4 -4
  2. data/.github/workflows/lint.yaml +25 -0
  3. data/.github/workflows/release.yaml +29 -0
  4. data/.github/workflows/tests.yaml +30 -0
  5. data/.gitignore +4 -0
  6. data/.rubocop.yml +5 -0
  7. data/.vale.ini +5 -0
  8. data/.yardopts +1 -3
  9. data/Dockerfile +5 -4
  10. data/Gemfile +3 -0
  11. data/Gemfile.lock +107 -102
  12. data/Rakefile +5 -56
  13. data/bin/wayfarer +1 -1
  14. data/docker-compose.yml +20 -9
  15. data/docs/cookbook/consent_screen.md +2 -2
  16. data/docs/cookbook/executing_javascript.md +3 -3
  17. data/docs/cookbook/navigation.md +12 -12
  18. data/docs/cookbook/querying_html.md +3 -3
  19. data/docs/cookbook/screenshots.md +2 -2
  20. data/docs/cookbook/user_agent.md +1 -1
  21. data/docs/design.md +36 -0
  22. data/docs/guides/callbacks.md +24 -126
  23. data/docs/guides/configuration.md +8 -8
  24. data/docs/guides/handlers.md +60 -0
  25. data/docs/guides/index.md +1 -0
  26. data/docs/guides/jobs/error_handling.md +40 -0
  27. data/docs/guides/jobs.md +99 -31
  28. data/docs/guides/navigation.md +1 -1
  29. data/docs/guides/networking/capybara.md +13 -22
  30. data/docs/guides/networking/custom_adapters.md +82 -41
  31. data/docs/guides/networking/ferrum.md +4 -4
  32. data/docs/guides/networking/http.md +9 -13
  33. data/docs/guides/networking/selenium.md +10 -11
  34. data/docs/guides/pages.md +76 -10
  35. data/docs/guides/redis.md +10 -0
  36. data/docs/guides/routing.md +74 -0
  37. data/docs/guides/tasks.md +33 -9
  38. data/docs/guides/tutorial.md +60 -0
  39. data/docs/guides/user_agents.md +113 -0
  40. data/docs/index.md +17 -40
  41. data/docs/reference/cli.md +35 -25
  42. data/docs/reference/configuration.md +36 -0
  43. data/lib/wayfarer/base.rb +124 -46
  44. data/lib/wayfarer/batch_completion.rb +56 -0
  45. data/lib/wayfarer/callbacks.rb +22 -48
  46. data/lib/wayfarer/cli/route_printer.rb +71 -57
  47. data/lib/wayfarer/cli.rb +121 -0
  48. data/lib/wayfarer/gc.rb +13 -6
  49. data/lib/wayfarer/handler.rb +15 -7
  50. data/lib/wayfarer/logging.rb +38 -0
  51. data/lib/wayfarer/middleware/base.rb +2 -0
  52. data/lib/wayfarer/middleware/batch_completion.rb +19 -0
  53. data/lib/wayfarer/middleware/content_type.rb +54 -0
  54. data/lib/wayfarer/middleware/controller.rb +19 -15
  55. data/lib/wayfarer/middleware/dedup.rb +16 -13
  56. data/lib/wayfarer/middleware/dispatch.rb +12 -4
  57. data/lib/wayfarer/middleware/normalize.rb +12 -11
  58. data/lib/wayfarer/middleware/redis.rb +15 -0
  59. data/lib/wayfarer/middleware/router.rb +33 -35
  60. data/lib/wayfarer/middleware/stage.rb +5 -5
  61. data/lib/wayfarer/middleware/uri_parser.rb +30 -0
  62. data/lib/wayfarer/middleware/user_agent.rb +49 -0
  63. data/lib/wayfarer/networking/capybara.rb +1 -1
  64. data/lib/wayfarer/networking/context.rb +2 -2
  65. data/lib/wayfarer/networking/ferrum.rb +2 -2
  66. data/lib/wayfarer/networking/follow.rb +12 -6
  67. data/lib/wayfarer/networking/http.rb +1 -1
  68. data/lib/wayfarer/networking/pool.rb +17 -12
  69. data/lib/wayfarer/networking/selenium.rb +3 -3
  70. data/lib/wayfarer/networking/strategy.rb +2 -2
  71. data/lib/wayfarer/page.rb +36 -14
  72. data/lib/wayfarer/parsing/xml.rb +6 -6
  73. data/lib/wayfarer/parsing.rb +24 -0
  74. data/lib/wayfarer/redis/barrier.rb +13 -21
  75. data/lib/wayfarer/redis/counter.rb +19 -9
  76. data/lib/wayfarer/redis/pool.rb +1 -1
  77. data/lib/wayfarer/redis/resettable.rb +19 -0
  78. data/lib/wayfarer/routing/dsl.rb +1 -0
  79. data/lib/wayfarer/routing/matchers/path.rb +4 -2
  80. data/lib/wayfarer/routing/root_route.rb +5 -1
  81. data/lib/wayfarer/routing/route.rb +4 -14
  82. data/lib/wayfarer/stringify.rb +22 -30
  83. data/lib/wayfarer/task.rb +12 -18
  84. data/lib/wayfarer.rb +29 -2
  85. data/mkdocs.yml +52 -7
  86. data/rake/docs.rake +26 -0
  87. data/rake/lint.rake +105 -0
  88. data/rake/release.rake +29 -0
  89. data/rake/tests.rake +28 -0
  90. data/requirements.txt +1 -1
  91. data/spec/base_spec.rb +140 -160
  92. data/spec/batch_completion_spec.rb +104 -0
  93. data/spec/cli/job_spec.rb +19 -23
  94. data/spec/cli/routing_spec.rb +101 -0
  95. data/spec/cli/version_spec.rb +1 -1
  96. data/spec/factories/task.rb +7 -1
  97. data/spec/fixtures/dummy_job.rb +5 -3
  98. data/spec/gc_spec.rb +8 -50
  99. data/spec/handler_spec.rb +1 -1
  100. data/spec/integration/callbacks_spec.rb +157 -45
  101. data/spec/integration/content_type_spec.rb +145 -0
  102. data/spec/integration/gc_spec.rb +44 -0
  103. data/spec/integration/handler_spec.rb +66 -0
  104. data/spec/integration/page_spec.rb +44 -29
  105. data/spec/integration/params_spec.rb +33 -25
  106. data/spec/integration/parsing_spec.rb +125 -0
  107. data/spec/integration/routing_spec.rb +18 -0
  108. data/spec/integration/stage_spec.rb +27 -20
  109. data/spec/middleware/batch_completion_spec.rb +34 -0
  110. data/spec/middleware/chain_spec.rb +8 -8
  111. data/spec/middleware/content_type_spec.rb +86 -0
  112. data/spec/middleware/controller_spec.rb +5 -5
  113. data/spec/middleware/dedup_spec.rb +38 -55
  114. data/spec/middleware/dispatch_spec.rb +23 -7
  115. data/spec/middleware/normalize_spec.rb +44 -13
  116. data/spec/middleware/router_spec.rb +29 -30
  117. data/spec/middleware/stage_spec.rb +8 -8
  118. data/spec/middleware/uri_parser_spec.rb +53 -0
  119. data/spec/middleware/{fetch_spec.rb → user_agent_spec.rb} +28 -27
  120. data/spec/networking/context_spec.rb +17 -0
  121. data/spec/networking/follow_spec.rb +2 -2
  122. data/spec/networking/pool_spec.rb +5 -5
  123. data/spec/networking/strategy.rb +2 -2
  124. data/spec/page_spec.rb +42 -20
  125. data/spec/parsing/xml_spec.rb +11 -12
  126. data/spec/redis/barrier_spec.rb +8 -48
  127. data/spec/redis/counter_spec.rb +13 -1
  128. data/spec/redis/pool_spec.rb +1 -1
  129. data/spec/spec_helpers.rb +27 -16
  130. data/spec/support/test_app.rb +8 -0
  131. data/spec/task_spec.rb +3 -24
  132. data/spec/wayfarer_spec.rb +1 -1
  133. data/wayfarer.gemspec +4 -3
  134. metadata +61 -51
  135. data/.github/workflows/ci.yaml +0 -32
  136. data/docs/guides/error_handling.md +0 -31
  137. data/docs/guides/networking.md +0 -94
  138. data/docs/guides/performance.md +0 -130
  139. data/docs/guides/reliability.md +0 -41
  140. data/docs/guides/routing/steering.md +0 -30
  141. data/docs/reference/api/base.md +0 -48
  142. data/docs/reference/configuration_keys.md +0 -42
  143. data/docs/reference/environment_variables.md +0 -83
  144. data/lib/wayfarer/cli/base.rb +0 -45
  145. data/lib/wayfarer/cli/generate.rb +0 -17
  146. data/lib/wayfarer/cli/job.rb +0 -56
  147. data/lib/wayfarer/cli/route.rb +0 -29
  148. data/lib/wayfarer/cli/runner.rb +0 -34
  149. data/lib/wayfarer/cli/templates/Gemfile.tt +0 -5
  150. data/lib/wayfarer/cli/templates/job.rb.tt +0 -10
  151. data/lib/wayfarer/config/capybara.rb +0 -10
  152. data/lib/wayfarer/config/ferrum.rb +0 -11
  153. data/lib/wayfarer/config/networking.rb +0 -26
  154. data/lib/wayfarer/config/redis.rb +0 -14
  155. data/lib/wayfarer/config/root.rb +0 -11
  156. data/lib/wayfarer/config/selenium.rb +0 -21
  157. data/lib/wayfarer/config/strconv.rb +0 -45
  158. data/lib/wayfarer/config/struct.rb +0 -72
  159. data/lib/wayfarer/middleware/fetch.rb +0 -56
  160. data/lib/wayfarer/redis/connection.rb +0 -13
  161. data/lib/wayfarer/redis/version.rb +0 -19
  162. data/lib/wayfarer/routing/router.rb +0 -28
  163. data/spec/callbacks_spec.rb +0 -102
  164. data/spec/cli/generate_spec.rb +0 -39
  165. data/spec/config/capybara_spec.rb +0 -18
  166. data/spec/config/ferrum_spec.rb +0 -24
  167. data/spec/config/networking_spec.rb +0 -73
  168. data/spec/config/redis_spec.rb +0 -32
  169. data/spec/config/root_spec.rb +0 -31
  170. data/spec/config/selenium_spec.rb +0 -56
  171. data/spec/config/strconv_spec.rb +0 -58
  172. data/spec/config/struct_spec.rb +0 -66
  173. data/spec/integration/steering_spec.rb +0 -57
  174. data/spec/redis/version_spec.rb +0 -13
  175. data/spec/routing/router_spec.rb +0 -24
@@ -6,12 +6,20 @@ module Wayfarer
6
6
  extend Base
7
7
 
8
8
  def call(task)
9
- controller = task.metadata.controller
9
+ controller = task[:controller]
10
10
 
11
- controller.run_callbacks(:action) do
12
- case action = task.metadata.action
11
+ task[:return_value] = controller.run_callbacks(:action) do
12
+ case action = task[:action]
13
13
  when Symbol then controller.public_send(action)
14
- else action.new.call(task)
14
+ when Array
15
+ handler, method = action
16
+ task[:action] = method
17
+ handler.new.call(task)
18
+ else
19
+ raise ArgumentError, "invalid action: #{action.inspect}" unless action&.include?(Wayfarer::Handler)
20
+
21
+ task[:action] = nil # TODO: Test
22
+ action.new.call(task)
15
23
  end
16
24
  end
17
25
 
@@ -4,23 +4,24 @@ module Wayfarer
4
4
  module Middleware
5
5
  class Normalize
6
6
  extend Base
7
+ include Wayfarer::Logging.emit(
8
+ invalid: [:info, "Failed to normalize HTTP(S) URL"]
9
+ )
7
10
 
8
- def call(task)
9
- yield if block_given?
11
+ def self.normalize(uri)
12
+ return uri.to_s unless %w[http https].include?(uri.scheme)
10
13
 
11
- task.metadata.staged_urls = SortedSet.new(normalized_urls(task).compact)
14
+ NormalizeUrl.process(uri)
15
+ rescue NormalizeUrl::InvalidURIError
16
+ nil
12
17
  end
13
18
 
14
- private
19
+ def call(task)
20
+ return (yield if block_given?) if task[:normalized_url]
15
21
 
16
- def normalized_urls(task)
17
- task.metadata.staged_urls.map(&method(:normalize))
18
- end
22
+ return log(:invalid, task) unless (task[:normalized_url] = self.class.normalize(task[:uri]))
19
23
 
20
- def normalize(url)
21
- NormalizeUrl.process(url)
22
- rescue NormalizeUrl::InvalidURIError
23
- nil
24
+ yield if block_given?
24
25
  end
25
26
  end
26
27
  end
@@ -0,0 +1,15 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Wayfarer
4
+ module Middleware
5
+ class Redis
6
+ extend Base
7
+
8
+ def call(task)
9
+ task[:redis_pool] ||= Wayfarer::Redis::Pool.instance
10
+
11
+ yield if block_given?
12
+ end
13
+ end
14
+ end
15
+ end
@@ -5,51 +5,49 @@ module Wayfarer
5
5
  class Router
6
6
  extend Base
7
7
 
8
- module API
9
- def self.included(base)
10
- base.extend(ClassMethods)
11
- base.include(InstanceMethods)
12
- end
13
-
14
- module ClassMethods
15
- def router
16
- # TODO: Use cattr_accessor
17
- @router ||= Wayfarer::Routing::Router.new
18
- end
8
+ include Wayfarer::Logging.emit(
9
+ mismatch: [:info, "No matching route"],
10
+ match: [:info, "Routing to %<action>s"],
11
+ already_routed: [:debug, "Already routed to %<action>s"]
12
+ )
19
13
 
20
- def route(&block)
21
- router.draw(&block) if block_given?
22
- end
14
+ module API
15
+ extend ActiveSupport::Concern
23
16
 
24
- def steer(&block)
25
- define_method(:steer) { block.call(task) }
26
- end
17
+ included do
18
+ class_attribute :route,
19
+ default: Wayfarer::Routing::RootRoute.new,
20
+ instance_accessor: false,
21
+ instance_predicate: false
27
22
  end
28
23
 
29
- module InstanceMethods
30
- def steer
31
- []
32
- end
24
+ def action
25
+ task[:action]
26
+ end
33
27
 
34
- def params
35
- task.metadata.params
36
- end
28
+ def params
29
+ task[:params]
37
30
  end
38
31
  end
39
32
 
40
33
  def call(task)
41
- controller = task.metadata.controller
42
- # TODO: The router has to be cloned because it's not thread-safe
43
- router = controller.class.router.clone
44
- url = Addressable::URI.parse(task.url)
45
-
46
- case result = router.invoke(url, controller.steer)
47
- when Routing::Result::Mismatch
48
- return
34
+ # Avoid rerouting when dispatching a [Controller, :action] pair
35
+ if (action = task[:action])
36
+ log(:already_routed, task, action: action)
37
+
38
+ return (yield if block_given?)
39
+ end
40
+
41
+ case result = task[:controller].class.route.invoke(task[:uri])
42
+ when Routing::Result::Mismatch then return log(:mismatch, task)
49
43
  when Routing::Result::Match
50
- task.metadata.action = result.action
51
- task.metadata.params ||= ActiveSupport::HashWithIndifferentAccess.new
52
- task.metadata.params.merge!(result.params)
44
+ action = result.action
45
+
46
+ log(:match, task, action: action.inspect)
47
+
48
+ task[:action] = action
49
+ task[:params] ||= ActiveSupport::HashWithIndifferentAccess.new
50
+ task[:params].merge!(result.params)
53
51
  end
54
52
 
55
53
  yield if block_given?
@@ -7,20 +7,20 @@ module Wayfarer
7
7
 
8
8
  module API
9
9
  def stage(urls)
10
- Array.wrap(urls).each { |url| task.metadata.staged_urls.add(url.to_s) }
10
+ Array.wrap(urls).each { |url| task[:staged_urls].add(url.to_s) }
11
11
  end
12
12
  end
13
13
 
14
14
  def call(task)
15
- task.metadata.staged_urls = SortedSet.new
15
+ task[:staged_urls] = Set.new
16
16
 
17
17
  yield if block_given?
18
18
 
19
- task.metadata.staged_urls.each do |url|
20
- task.metadata.job.class.crawl(url, batch: task.batch)
19
+ task[:staged_urls].each do |url|
20
+ task[:job].class.crawl(url, batch: task.batch)
21
21
  end
22
22
 
23
- task.metadata.staged_urls.clear
23
+ task[:staged_urls].clear
24
24
  end
25
25
  end
26
26
  end
@@ -0,0 +1,30 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Wayfarer
4
+ module Middleware
5
+ class UriParser
6
+ extend Base
7
+
8
+ include Wayfarer::Logging.emit(
9
+ invalid: [:info, "Not processing invalid URL (%<message>s)"]
10
+ )
11
+
12
+ module API
13
+ def uri
14
+ task[:uri]
15
+ end
16
+ end
17
+
18
+ def call(task)
19
+ # TODO: Test
20
+ task[:uri] ||= begin
21
+ Addressable::URI.parse(task.url).normalize
22
+ rescue Addressable::URI::InvalidURIError => e
23
+ return log(:invalid, task, message: e.message)
24
+ end
25
+
26
+ yield if block_given?
27
+ end
28
+ end
29
+ end
30
+ end
@@ -0,0 +1,49 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Wayfarer
4
+ module Middleware
5
+ class UserAgent
6
+ extend Base
7
+
8
+ module API
9
+ def user_agent
10
+ task[:context]&.instance
11
+ end
12
+
13
+ def page(live: false)
14
+ return task[:page] unless live
15
+
16
+ task[:page] = task[:context].live&.page || task[:page]
17
+ end
18
+
19
+ def fetch(url, follow: 3)
20
+ (@http ||= Wayfarer::Networking::Follow.http).fetch(url, follow: follow)
21
+ end
22
+ end
23
+
24
+ def call(task)
25
+ pool.with do |context|
26
+ task[:context] = context
27
+
28
+ result = task[:controller].run_callbacks(:fetch) do
29
+ context.fetch(task.url)
30
+ end
31
+
32
+ case result
33
+ when Networking::Result::Redirect
34
+ task[:controller].stage(result.redirect_url)
35
+ when Networking::Result::Success
36
+ task[:page] = result.page
37
+ yield if block_given?
38
+ end
39
+ end
40
+ end
41
+
42
+ private
43
+
44
+ def pool
45
+ Wayfarer::Networking::Pool.instance
46
+ end
47
+ end
48
+ end
49
+ end
@@ -6,7 +6,7 @@ module Wayfarer
6
6
  include Strategy
7
7
 
8
8
  def create
9
- ::Capybara::Session.new(Wayfarer.config.capybara.driver, nil)
9
+ ::Capybara::Session.new(Wayfarer.config[:capybara][:driver], nil)
10
10
  end
11
11
 
12
12
  def destroy(instance)
@@ -25,8 +25,8 @@ module Wayfarer
25
25
 
26
26
  def supervise
27
27
  yield
28
- rescue *strategy.renew_on => e
29
- renew
28
+ rescue *strategy.renew_on, *Wayfarer.config[:network][:renew_on] => e
29
+ renew # may raise
30
30
  ensure
31
31
  # If renewing raises, re-raise the originally caught exception
32
32
  # TODO: Not nice this effectively swallows exceptions
@@ -10,8 +10,8 @@ module Wayfarer
10
10
  end
11
11
 
12
12
  def create
13
- ::Ferrum::Browser.new(Wayfarer.config.ferrum.options).tap do |browser|
14
- browser.headers.set(Wayfarer.config.network.http_headers)
13
+ ::Ferrum::Browser.new(Wayfarer.config[:ferrum][:options]).tap do |browser|
14
+ browser.headers.set(Wayfarer.config[:network][:http_headers])
15
15
  end
16
16
  end
17
17
 
@@ -2,15 +2,21 @@
2
2
 
3
3
  module Wayfarer
4
4
  module Networking
5
- RedirectsExhaustedError = Class.new(StandardError)
5
+ class Follow
6
+ RedirectsExhaustedError = Class.new(StandardError)
6
7
 
7
- Follow = Struct.new(:context) do
8
- extend Forwardable
8
+ def self.http
9
+ new(Wayfarer::Networking::Context.new(Wayfarer::Networking::HTTP.new))
10
+ end
11
+
12
+ attr_reader :context
9
13
 
10
- delegate %i[live renew instance] => :context
14
+ def initialize(context)
15
+ @context = context
16
+ end
11
17
 
12
- def fetch(url, follow: 3)
13
- raise RedirectsExhaustedError if follow.negative?
18
+ def fetch(url, follow:)
19
+ raise RedirectsExhaustedError if follow < 0
14
20
 
15
21
  case result = context.fetch(url)
16
22
  when Result::Success then result.page
@@ -9,7 +9,7 @@ module Wayfarer
9
9
 
10
10
  def create
11
11
  Net::HTTP::Persistent.new(name: CONNECTION_NAME).tap do |conn|
12
- Wayfarer.config.network.http_headers.each do |key, val|
12
+ Wayfarer.config[:network][:http_headers].each do |key, val|
13
13
  conn.override_headers[key] = val
14
14
  end
15
15
  end
@@ -5,25 +5,30 @@ module Wayfarer
5
5
  class Pool
6
6
  include Singleton
7
7
 
8
- cattr_accessor :registry, default: { http: HTTP,
9
- ferrum: Ferrum,
10
- selenium: Selenium,
11
- capybara: Capybara }
12
-
13
- def pool
14
- @pool ||= ConnectionPool.new(size: Wayfarer.config.network.pool_size,
15
- timeout: Wayfarer.config.network.pool_timeout,
16
- &method(:context))
8
+ class_attribute :registry,
9
+ default: { http: HTTP,
10
+ ferrum: Ferrum,
11
+ selenium: Selenium,
12
+ capybara: Capybara },
13
+ instance_accessor: false,
14
+ instance_predicate: false
15
+
16
+ def initialize
17
+ @pool = ConnectionPool.new(size: Wayfarer.config[:network][:pool_size],
18
+ timeout: Wayfarer.config[:network][:pool_timeout],
19
+ &method(:context))
20
+
21
+ at_exit { free }
17
22
  end
18
23
 
19
24
  def with(&block)
20
- pool.with(&block)
25
+ @pool.with(&block)
21
26
  rescue ConnectionPool::TimeoutError => e
22
27
  raise Wayfarer::UserAgentTimeoutError, e
23
28
  end
24
29
 
25
30
  def free
26
- pool.shutdown(&:renew)
31
+ @pool.shutdown(&:renew)
27
32
  end
28
33
 
29
34
  private
@@ -33,7 +38,7 @@ module Wayfarer
33
38
  end
34
39
 
35
40
  def strategy
36
- self.class.registry[Wayfarer.config.network.agent].new
41
+ self.class.registry[Wayfarer.config[:network][:agent]].new
37
42
  end
38
43
  end
39
44
  end
@@ -9,7 +9,7 @@ module Wayfarer
9
9
  MOCK_RESPONSE_HEADERS = {}.freeze
10
10
 
11
11
  def create
12
- ::Selenium::WebDriver.for(Wayfarer.config.selenium.driver, **options)
12
+ ::Selenium::WebDriver.for(Wayfarer.config[:selenium][:driver], **options)
13
13
  end
14
14
 
15
15
  def destroy(instance)
@@ -30,12 +30,12 @@ module Wayfarer
30
30
  private
31
31
 
32
32
  def options
33
- Wayfarer.config.selenium.options.merge(http_client: http_client)
33
+ Wayfarer.config[:selenium][:options].merge(http_client: http_client)
34
34
  end
35
35
 
36
36
  def http_client
37
37
  ::Selenium::WebDriver::Remote::Http::Default.new.tap do |client|
38
- client.read_timeout = Wayfarer.config.selenium.client_timeout
38
+ client.read_timeout = Wayfarer.config[:selenium][:client_timeout]
39
39
  end
40
40
  end
41
41
  end
@@ -13,13 +13,13 @@ module Wayfarer
13
13
  end
14
14
 
15
15
  def navigate(_instance, _url)
16
- raise NoMethodError
16
+ raise NotImplementedError
17
17
  end
18
18
 
19
19
  def live(_instance); end
20
20
 
21
21
  def create
22
- raise NoMethodError
22
+ raise NotImplementedError
23
23
  end
24
24
 
25
25
  def destroy(_instance); end
data/lib/wayfarer/page.rb CHANGED
@@ -1,12 +1,22 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Wayfarer
4
+ # @!attribute [r] url
5
+ # @return [String] the URL that was fetched
6
+ # @!attribute [r] status_code
7
+ # @return [Fixnum] HTTP status code
8
+ # @!attribute [r] body
9
+ # @return [String] the body of the response
10
+ # @!attribute [r] headers
11
+ # @return [Hash] the headers of the response
12
+ # @note HTTP header keys are downcased, for example: `content-type`.
4
13
  class Page
5
14
  attr_reader :url,
6
15
  :status_code,
7
16
  :body,
8
17
  :headers
9
18
 
19
+ # @!visibility private
10
20
  def initialize(url:, status_code:, body:, headers:)
11
21
  @url = url
12
22
  @status_code = status_code
@@ -14,24 +24,36 @@ module Wayfarer
14
24
  @headers = headers.transform_keys(&:downcase)
15
25
  end
16
26
 
17
- def doc
18
- return @doc if @doc
19
-
20
- # If no Content-Type field is present, assume HTML/XML
21
- return @doc = Wayfarer::Parsing::XML.parse_html(body) unless headers["content-type"]
22
-
23
- content_type = headers["content-type"]
24
- sub_type = MIME::Types[content_type].first.sub_type
27
+ # Returns the MIME type of the response.
28
+ # @return [MIME::Type]
29
+ # @see https://www.rubydoc.info/gems/mime-types/MIME/Type
30
+ def mime_type
31
+ @mime_type ||= MIME::Types[content_type]&.first
32
+ end
25
33
 
26
- @doc = case sub_type
27
- when "json" then Wayfarer::Parsing::JSON.parse(body)
28
- when "xml" then Wayfarer::Parsing::XML.parse_xml(body)
29
- else Wayfarer::Parsing::XML.parse_html(body)
30
- end
34
+ # Returns a parsed representation of the HTTP response or the browser DOM,
35
+ # depending on the Content-Type.
36
+ # @return [Nokogiri::HTML::Document] when Content-Type is `text/html`
37
+ # @see https://www.rubydoc.info/github/sparklemotion/nokogiri/Nokogiri/HTML/Document Nokogiri::HTML::Document
38
+ # @return [Nokogiri::XML::Document] when Content-Type is `text/xml`
39
+ # @see https://www.rubydoc.info/github/sparklemotion/nokogiri/Nokogiri/XML/Document Nokogiri::XML::Document
40
+ # @return [Hash] when Content-Type is `application/json`
41
+ # @note You can register custom parsers with {Wayfarer::Parsing.registry}.
42
+ def doc
43
+ @doc ||= Wayfarer::Parsing.parse(body, mime_type&.content_type || content_type)
31
44
  end
32
45
 
46
+ # Returns a `MetaInspector::Document`.
47
+ # @return [MetaInspector::Document]
48
+ # @see https://www.rubydoc.info/gems/metainspector/MetaInspector/Document
33
49
  def meta
34
- @meta ||= MetaInspector.new(url, document: body)
50
+ @meta ||= MetaInspector.new(url, document: body, headers: headers, normalize_url: false)
51
+ end
52
+
53
+ private
54
+
55
+ def content_type
56
+ @content_type ||= headers["content-type"]
35
57
  end
36
58
  end
37
59
  end
@@ -5,12 +5,12 @@ module Wayfarer
5
5
  module XML
6
6
  module_function
7
7
 
8
- def parse_xml(xml)
9
- Nokogiri::XML(xml)
10
- end
11
-
12
- def parse_html(html)
13
- Nokogiri::HTML(html)
8
+ def parse(xml, variant)
9
+ case variant
10
+ when :xml then Nokogiri::XML(xml)
11
+ when :html then Nokogiri::HTML(xml)
12
+ else raise ArgumentError, "Unknown type: #{type}"
13
+ end
14
14
  end
15
15
  end
16
16
  end
@@ -0,0 +1,24 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Wayfarer
4
+ # @!scope class
5
+ # @!attribute [r] registry
6
+ # @return [Hash] Mapping of Content-Type to parser.
7
+ module Parsing
8
+ # @!visibility private
9
+ FALLBACK_CONTENT_TYPE = "application/octet-stream"
10
+
11
+ mattr_accessor :registry, default: { "application/json" => JSON,
12
+ "text/html" => [XML, :html],
13
+ "application/xml" => [XML, :xml] }
14
+
15
+ module_function
16
+
17
+ # @!visibility private
18
+ def parse(body, content_type = FALLBACK_CONTENT_TYPE)
19
+ parser, args = registry[content_type] || return
20
+
21
+ parser.parse(body, *args)
22
+ end
23
+ end
24
+ end
@@ -2,35 +2,27 @@
2
2
 
3
3
  module Wayfarer
4
4
  module Redis
5
- Barrier = Struct.new(:batch) do
6
- include Connection
5
+ class Barrier
6
+ include Resettable
7
7
 
8
- def redis_key
9
- "wayfarer-barrier-#{batch}"
10
- end
8
+ attr_reader :task
11
9
 
12
- def reset!
13
- redis { |conn| conn.del(redis_key) }
10
+ def initialize(task)
11
+ @task = task
12
+ @redis_pool = task[:redis_pool]
14
13
  end
15
14
 
16
- def seen?(url)
17
- !redis { |conn| conn.sadd(redis_key, url) }
15
+ def redis_key
16
+ "wayfarer-barrier-#{task.batch}"
18
17
  end
19
18
 
20
- def peek(urls)
21
- major, minor, = Version.determine
22
-
23
- # SMISMEMBER is only supported on Redis >= 6.2.0
24
- if major > 6 || (major == 6 && minor >= 2)
25
- redis { |conn| conn.smismember(redis_key, urls) }.map { |val| val == 1 }
26
- else
27
- urls.map { |url| redis { |conn| conn.sismember(redis_key, url) } }
28
- end
19
+ def check!(url)
20
+ !redis_pool.with { |conn| conn.hsetnx(redis_key, url, "") }
29
21
  end
30
22
 
31
- def unsee(url)
32
- redis { |conn| conn.srem(redis_key, url) }
33
- end
23
+ private
24
+
25
+ attr_reader :redis_pool
34
26
  end
35
27
  end
36
28
  end
@@ -2,28 +2,38 @@
2
2
 
3
3
  module Wayfarer
4
4
  module Redis
5
- Counter = Struct.new(:batch) do
6
- include Connection
5
+ class Counter
6
+ include Resettable
7
7
 
8
- def redis_key
9
- "wayfarer-counter-#{batch}"
8
+ attr_reader :task
9
+
10
+ def initialize(task, &callback)
11
+ @task = task
12
+ @callback = callback
13
+ @redis_pool = task[:redis_pool]
10
14
  end
11
15
 
12
- def reset!
13
- redis { |conn| conn.del(redis_key) }
16
+ def redis_key
17
+ "wayfarer-counter-#{@task.batch}"
14
18
  end
15
19
 
16
20
  def value
17
- redis { |conn| conn.get(redis_key) }.to_i
21
+ redis_pool.with { |conn| conn.get(redis_key) }.to_i
18
22
  end
19
23
 
20
24
  def increment
21
- redis { |conn| conn.incr(redis_key) }
25
+ redis_pool.with { |conn| conn.incr(redis_key) }
22
26
  end
23
27
 
24
28
  def decrement
25
- redis { |conn| conn.decr(redis_key) }
29
+ redis_pool.with { |conn| conn.decr(redis_key) }.tap do |val|
30
+ @callback&.call if val == 0
31
+ end
26
32
  end
33
+
34
+ private
35
+
36
+ attr_reader :redis_pool
27
37
  end
28
38
  end
29
39
  end
@@ -10,7 +10,7 @@ module Wayfarer
10
10
 
11
11
  def initialize
12
12
  @pool = ConnectionPool.new do
13
- Wayfarer.config.redis.factory.call(Wayfarer.config.redis)
13
+ Wayfarer.config[:redis][:factory].call(Wayfarer.config[:redis])
14
14
  end
15
15
  end
16
16