wayfarer 0.4.6 → 0.4.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (259) hide show
  1. checksums.yaml +4 -4
  2. data/.env +17 -0
  3. data/.github/workflows/lint.yaml +27 -0
  4. data/.github/workflows/release.yaml +30 -0
  5. data/.github/workflows/tests.yaml +21 -0
  6. data/.gitignore +5 -1
  7. data/.rubocop.yml +36 -0
  8. data/.vale.ini +8 -0
  9. data/.yardopts +1 -3
  10. data/Dockerfile +6 -4
  11. data/Gemfile +24 -0
  12. data/Gemfile.lock +274 -164
  13. data/Rakefile +7 -51
  14. data/bin/wayfarer +1 -1
  15. data/docker-compose.yml +23 -13
  16. data/docs/cookbook/consent_screen.md +2 -2
  17. data/docs/cookbook/executing_javascript.md +3 -3
  18. data/docs/cookbook/navigation.md +12 -12
  19. data/docs/cookbook/querying_html.md +3 -3
  20. data/docs/cookbook/screenshots.md +2 -2
  21. data/docs/guides/callbacks.md +25 -125
  22. data/docs/guides/cli.md +71 -0
  23. data/docs/guides/configuration.md +10 -35
  24. data/docs/guides/development.md +67 -0
  25. data/docs/guides/handlers.md +60 -0
  26. data/docs/guides/index.md +1 -0
  27. data/docs/guides/jobs.md +142 -31
  28. data/docs/guides/navigation.md +1 -1
  29. data/docs/guides/networking/capybara.md +13 -22
  30. data/docs/guides/networking/custom_adapters.md +103 -41
  31. data/docs/guides/networking/ferrum.md +4 -4
  32. data/docs/guides/networking/http.md +9 -13
  33. data/docs/guides/networking/selenium.md +10 -11
  34. data/docs/guides/pages.md +78 -10
  35. data/docs/guides/redis.md +10 -0
  36. data/docs/guides/routing.md +156 -0
  37. data/docs/guides/tasks.md +53 -9
  38. data/docs/guides/tutorial.md +66 -0
  39. data/docs/guides/user_agents.md +115 -0
  40. data/docs/index.md +17 -40
  41. data/lib/wayfarer/base.rb +125 -46
  42. data/lib/wayfarer/batch_completion.rb +60 -0
  43. data/lib/wayfarer/callbacks.rb +22 -48
  44. data/lib/wayfarer/cli/route_printer.rb +85 -89
  45. data/lib/wayfarer/cli.rb +103 -0
  46. data/lib/wayfarer/gc.rb +18 -6
  47. data/lib/wayfarer/handler.rb +15 -7
  48. data/lib/wayfarer/kv.rb +28 -0
  49. data/lib/wayfarer/logging.rb +38 -0
  50. data/lib/wayfarer/middleware/base.rb +2 -0
  51. data/lib/wayfarer/middleware/batch_completion.rb +19 -0
  52. data/lib/wayfarer/middleware/chain.rb +7 -1
  53. data/lib/wayfarer/middleware/content_type.rb +59 -0
  54. data/lib/wayfarer/middleware/controller.rb +19 -15
  55. data/lib/wayfarer/middleware/dedup.rb +22 -13
  56. data/lib/wayfarer/middleware/dispatch.rb +17 -4
  57. data/lib/wayfarer/middleware/normalize.rb +7 -14
  58. data/lib/wayfarer/middleware/redis.rb +15 -0
  59. data/lib/wayfarer/middleware/router.rb +33 -35
  60. data/lib/wayfarer/middleware/stage.rb +5 -5
  61. data/lib/wayfarer/middleware/uri_parser.rb +31 -0
  62. data/lib/wayfarer/middleware/user_agent.rb +49 -0
  63. data/lib/wayfarer/networking/capybara.rb +1 -1
  64. data/lib/wayfarer/networking/context.rb +14 -3
  65. data/lib/wayfarer/networking/ferrum.rb +1 -4
  66. data/lib/wayfarer/networking/follow.rb +14 -7
  67. data/lib/wayfarer/networking/http.rb +1 -1
  68. data/lib/wayfarer/networking/pool.rb +23 -13
  69. data/lib/wayfarer/networking/selenium.rb +15 -7
  70. data/lib/wayfarer/networking/strategy.rb +2 -2
  71. data/lib/wayfarer/page.rb +34 -14
  72. data/lib/wayfarer/parsing/xml.rb +6 -6
  73. data/lib/wayfarer/parsing.rb +21 -0
  74. data/lib/wayfarer/redis/barrier.rb +26 -21
  75. data/lib/wayfarer/redis/counter.rb +18 -9
  76. data/lib/wayfarer/redis/pool.rb +1 -1
  77. data/lib/wayfarer/redis/resettable.rb +19 -0
  78. data/lib/wayfarer/routing/dsl.rb +166 -30
  79. data/lib/wayfarer/routing/hash_stack.rb +33 -0
  80. data/lib/wayfarer/routing/matchers/custom.rb +8 -5
  81. data/lib/wayfarer/routing/matchers/{suffix.rb → empty_params.rb} +2 -6
  82. data/lib/wayfarer/routing/matchers/host.rb +15 -9
  83. data/lib/wayfarer/routing/matchers/path.rb +11 -31
  84. data/lib/wayfarer/routing/matchers/query.rb +41 -17
  85. data/lib/wayfarer/routing/matchers/result.rb +12 -0
  86. data/lib/wayfarer/routing/matchers/scheme.rb +13 -5
  87. data/lib/wayfarer/routing/matchers/url.rb +13 -5
  88. data/lib/wayfarer/routing/path_consumer.rb +130 -0
  89. data/lib/wayfarer/routing/path_finder.rb +151 -23
  90. data/lib/wayfarer/routing/result.rb +1 -1
  91. data/lib/wayfarer/routing/root_route.rb +17 -1
  92. data/lib/wayfarer/routing/route.rb +66 -19
  93. data/lib/wayfarer/routing/serializable.rb +28 -0
  94. data/lib/wayfarer/routing/sub_route.rb +53 -0
  95. data/lib/wayfarer/routing/target_route.rb +17 -1
  96. data/lib/wayfarer/stringify.rb +21 -30
  97. data/lib/wayfarer/task.rb +9 -17
  98. data/lib/wayfarer/uri/normalization.rb +120 -0
  99. data/lib/wayfarer.rb +72 -5
  100. data/mise.toml +2 -0
  101. data/mkdocs.yml +44 -8
  102. data/rake/docs.rake +26 -0
  103. data/rake/lint.rake +9 -0
  104. data/rake/release.rake +23 -0
  105. data/rake/tests.rake +32 -0
  106. data/requirements.txt +1 -1
  107. data/spec/factories/job.rb +8 -0
  108. data/spec/factories/middleware.rb +2 -2
  109. data/spec/factories/path_finder.rb +11 -0
  110. data/spec/factories/redis.rb +19 -0
  111. data/spec/factories/task.rb +46 -2
  112. data/spec/spec_helpers.rb +55 -51
  113. data/spec/support/active_job_helpers.rb +8 -0
  114. data/spec/support/integration_helpers.rb +21 -0
  115. data/spec/support/redis_helpers.rb +9 -0
  116. data/spec/support/test_app.rb +66 -37
  117. data/spec/wayfarer/base_spec.rb +200 -0
  118. data/spec/wayfarer/batch_completion_spec.rb +142 -0
  119. data/spec/wayfarer/cli/job_spec.rb +88 -0
  120. data/spec/wayfarer/cli/routing_spec.rb +322 -0
  121. data/spec/{cli → wayfarer/cli}/version_spec.rb +1 -1
  122. data/spec/wayfarer/gc_spec.rb +29 -0
  123. data/spec/wayfarer/handler_spec.rb +9 -0
  124. data/spec/wayfarer/integration/callbacks_spec.rb +200 -0
  125. data/spec/wayfarer/integration/content_type_spec.rb +37 -0
  126. data/spec/wayfarer/integration/custom_routing_spec.rb +51 -0
  127. data/spec/wayfarer/integration/gc_spec.rb +40 -0
  128. data/spec/wayfarer/integration/handler_spec.rb +65 -0
  129. data/spec/wayfarer/integration/page_spec.rb +79 -0
  130. data/spec/wayfarer/integration/params_spec.rb +64 -0
  131. data/spec/wayfarer/integration/parsing_spec.rb +99 -0
  132. data/spec/wayfarer/integration/retry_spec.rb +112 -0
  133. data/spec/wayfarer/integration/stage_spec.rb +58 -0
  134. data/spec/wayfarer/middleware/batch_completion_spec.rb +33 -0
  135. data/spec/{middleware → wayfarer/middleware}/chain_spec.rb +24 -19
  136. data/spec/wayfarer/middleware/content_type_spec.rb +83 -0
  137. data/spec/{middleware → wayfarer/middleware}/controller_spec.rb +24 -22
  138. data/spec/wayfarer/middleware/dedup_spec.rb +66 -0
  139. data/spec/wayfarer/middleware/normalize_spec.rb +32 -0
  140. data/spec/wayfarer/middleware/router_spec.rb +102 -0
  141. data/spec/wayfarer/middleware/stage_spec.rb +63 -0
  142. data/spec/wayfarer/middleware/uri_parser_spec.rb +63 -0
  143. data/spec/wayfarer/middleware/user_agent_spec.rb +158 -0
  144. data/spec/wayfarer/networking/capybara_spec.rb +13 -0
  145. data/spec/{networking → wayfarer/networking}/context_spec.rb +46 -38
  146. data/spec/wayfarer/networking/ferrum_spec.rb +13 -0
  147. data/spec/{networking → wayfarer/networking}/follow_spec.rb +11 -6
  148. data/spec/wayfarer/networking/http_spec.rb +12 -0
  149. data/spec/{networking → wayfarer/networking}/pool_spec.rb +16 -14
  150. data/spec/wayfarer/networking/selenium_spec.rb +12 -0
  151. data/spec/{networking → wayfarer/networking}/strategy.rb +33 -54
  152. data/spec/wayfarer/page_spec.rb +69 -0
  153. data/spec/{parsing → wayfarer/parsing}/json_spec.rb +1 -1
  154. data/spec/wayfarer/parsing/xml_parse_spec.rb +25 -0
  155. data/spec/wayfarer/redis/barrier_spec.rb +39 -0
  156. data/spec/wayfarer/redis/counter_spec.rb +34 -0
  157. data/spec/{redis → wayfarer/redis}/pool_spec.rb +4 -3
  158. data/spec/{routing → wayfarer/routing}/dsl_spec.rb +12 -22
  159. data/spec/wayfarer/routing/hash_stack_spec.rb +63 -0
  160. data/spec/wayfarer/routing/integration_spec.rb +101 -0
  161. data/spec/wayfarer/routing/matchers/custom_spec.rb +39 -0
  162. data/spec/wayfarer/routing/matchers/host_spec.rb +56 -0
  163. data/spec/wayfarer/routing/matchers/matcher.rb +17 -0
  164. data/spec/wayfarer/routing/matchers/path_spec.rb +43 -0
  165. data/spec/wayfarer/routing/matchers/query_spec.rb +123 -0
  166. data/spec/wayfarer/routing/matchers/scheme_spec.rb +45 -0
  167. data/spec/wayfarer/routing/matchers/url_spec.rb +33 -0
  168. data/spec/wayfarer/routing/path_consumer_spec.rb +123 -0
  169. data/spec/wayfarer/routing/path_finder_spec.rb +409 -0
  170. data/spec/wayfarer/routing/root_route_spec.rb +51 -0
  171. data/spec/wayfarer/routing/route_spec.rb +74 -0
  172. data/spec/wayfarer/routing/sub_route_spec.rb +103 -0
  173. data/spec/wayfarer/task_spec.rb +13 -0
  174. data/spec/wayfarer/uri/normalization_spec.rb +98 -0
  175. data/spec/wayfarer_spec.rb +2 -2
  176. data/wayfarer.gemspec +18 -28
  177. metadata +797 -265
  178. data/.github/workflows/ci.yaml +0 -32
  179. data/.rbenv-gemsets +0 -1
  180. data/.ruby-version +0 -1
  181. data/RELEASING.md +0 -17
  182. data/docs/cookbook/user_agent.md +0 -7
  183. data/docs/guides/error_handling.md +0 -53
  184. data/docs/guides/networking.md +0 -94
  185. data/docs/guides/performance.md +0 -130
  186. data/docs/guides/reliability.md +0 -41
  187. data/docs/guides/routing/steering.md +0 -30
  188. data/docs/reference/api/base.md +0 -48
  189. data/docs/reference/cli.md +0 -61
  190. data/docs/reference/configuration_keys.md +0 -43
  191. data/docs/reference/environment_variables.md +0 -83
  192. data/lib/wayfarer/cli/base.rb +0 -45
  193. data/lib/wayfarer/cli/generate.rb +0 -17
  194. data/lib/wayfarer/cli/job.rb +0 -56
  195. data/lib/wayfarer/cli/route.rb +0 -29
  196. data/lib/wayfarer/cli/runner.rb +0 -34
  197. data/lib/wayfarer/cli/templates/Gemfile.tt +0 -5
  198. data/lib/wayfarer/cli/templates/job.rb.tt +0 -10
  199. data/lib/wayfarer/config/capybara.rb +0 -10
  200. data/lib/wayfarer/config/ferrum.rb +0 -11
  201. data/lib/wayfarer/config/networking.rb +0 -29
  202. data/lib/wayfarer/config/redis.rb +0 -14
  203. data/lib/wayfarer/config/root.rb +0 -11
  204. data/lib/wayfarer/config/selenium.rb +0 -21
  205. data/lib/wayfarer/config/strconv.rb +0 -45
  206. data/lib/wayfarer/config/struct.rb +0 -72
  207. data/lib/wayfarer/middleware/fetch.rb +0 -56
  208. data/lib/wayfarer/redis/connection.rb +0 -13
  209. data/lib/wayfarer/redis/version.rb +0 -19
  210. data/lib/wayfarer/routing/router.rb +0 -28
  211. data/spec/base_spec.rb +0 -224
  212. data/spec/callbacks_spec.rb +0 -102
  213. data/spec/cli/generate_spec.rb +0 -39
  214. data/spec/cli/job_spec.rb +0 -78
  215. data/spec/config/capybara_spec.rb +0 -18
  216. data/spec/config/ferrum_spec.rb +0 -24
  217. data/spec/config/networking_spec.rb +0 -73
  218. data/spec/config/redis_spec.rb +0 -32
  219. data/spec/config/root_spec.rb +0 -31
  220. data/spec/config/selenium_spec.rb +0 -56
  221. data/spec/config/strconv_spec.rb +0 -58
  222. data/spec/config/struct_spec.rb +0 -66
  223. data/spec/fixtures/dummy_job.rb +0 -7
  224. data/spec/gc_spec.rb +0 -59
  225. data/spec/handler_spec.rb +0 -11
  226. data/spec/integration/callbacks_spec.rb +0 -85
  227. data/spec/integration/page_spec.rb +0 -62
  228. data/spec/integration/params_spec.rb +0 -56
  229. data/spec/integration/stage_spec.rb +0 -51
  230. data/spec/integration/steering_spec.rb +0 -57
  231. data/spec/middleware/dedup_spec.rb +0 -88
  232. data/spec/middleware/dispatch_spec.rb +0 -43
  233. data/spec/middleware/fetch_spec.rb +0 -155
  234. data/spec/middleware/normalize_spec.rb +0 -29
  235. data/spec/middleware/router_spec.rb +0 -105
  236. data/spec/middleware/stage_spec.rb +0 -62
  237. data/spec/networking/capybara_spec.rb +0 -12
  238. data/spec/networking/ferrum_spec.rb +0 -12
  239. data/spec/networking/http_spec.rb +0 -12
  240. data/spec/networking/selenium_spec.rb +0 -12
  241. data/spec/page_spec.rb +0 -47
  242. data/spec/parsing/xml_spec.rb +0 -25
  243. data/spec/redis/barrier_spec.rb +0 -78
  244. data/spec/redis/counter_spec.rb +0 -32
  245. data/spec/redis/version_spec.rb +0 -13
  246. data/spec/routing/integration_spec.rb +0 -110
  247. data/spec/routing/matchers/custom_spec.rb +0 -31
  248. data/spec/routing/matchers/host_spec.rb +0 -49
  249. data/spec/routing/matchers/path_spec.rb +0 -43
  250. data/spec/routing/matchers/query_spec.rb +0 -137
  251. data/spec/routing/matchers/scheme_spec.rb +0 -25
  252. data/spec/routing/matchers/suffix_spec.rb +0 -41
  253. data/spec/routing/matchers/uri_spec.rb +0 -27
  254. data/spec/routing/path_finder_spec.rb +0 -33
  255. data/spec/routing/root_route_spec.rb +0 -29
  256. data/spec/routing/route_spec.rb +0 -43
  257. data/spec/routing/router_spec.rb +0 -24
  258. data/spec/task_spec.rb +0 -34
  259. data/spec/{stringify_spec.rb → wayfarer/stringify_spec.rb} +2 -2
@@ -9,7 +9,7 @@ module Wayfarer
9
9
 
10
10
  def create
11
11
  Net::HTTP::Persistent.new(name: CONNECTION_NAME).tap do |conn|
12
- Wayfarer.config.network.http_headers.each do |key, val|
12
+ Wayfarer.config[:network][:http_headers].each do |key, val|
13
13
  conn.override_headers[key] = val
14
14
  end
15
15
  end
@@ -5,25 +5,35 @@ module Wayfarer
5
5
  class Pool
6
6
  include Singleton
7
7
 
8
- cattr_accessor :registry, default: { http: HTTP,
9
- ferrum: Ferrum,
10
- selenium: Selenium,
11
- capybara: Capybara }
12
-
13
- def pool
14
- @pool ||= ConnectionPool.new(size: Wayfarer.config.network.pool_size,
15
- timeout: Wayfarer.config.network.pool_timeout,
16
- &method(:context))
8
+ class_attribute :finalizer,
9
+ default: ->(pool) { pool.shutdown(&:renew) }.freeze,
10
+ instance_accessor: false,
11
+ instance_predicate: false
12
+
13
+ class_attribute :registry,
14
+ default: { http: HTTP,
15
+ ferrum: Ferrum,
16
+ selenium: Selenium,
17
+ capybara: Capybara },
18
+ instance_accessor: false,
19
+ instance_predicate: false
20
+
21
+ attr_reader :pool
22
+
23
+ def initialize
24
+ @pool = ConnectionPool.new(**Wayfarer.config.dig(:network, :pool), &method(:context))
25
+
26
+ at_exit { free }
17
27
  end
18
28
 
19
- def with(&block)
20
- pool.with(&block)
29
+ def with(&)
30
+ @pool.with(&)
21
31
  rescue ConnectionPool::TimeoutError => e
22
32
  raise Wayfarer::UserAgentTimeoutError, e
23
33
  end
24
34
 
25
35
  def free
26
- pool.shutdown(&:renew)
36
+ self.class.finalizer.call(@pool)
27
37
  end
28
38
 
29
39
  private
@@ -33,7 +43,7 @@ module Wayfarer
33
43
  end
34
44
 
35
45
  def strategy
36
- self.class.registry[Wayfarer.config.network.agent].new
46
+ self.class.registry[Wayfarer.config.dig(:network, :agent)].new
37
47
  end
38
48
  end
39
49
  end
@@ -9,7 +9,7 @@ module Wayfarer
9
9
  MOCK_RESPONSE_HEADERS = {}.freeze
10
10
 
11
11
  def create
12
- ::Selenium::WebDriver.for(Wayfarer.config.selenium.driver, **options)
12
+ ::Selenium::WebDriver.for(driver, options)
13
13
  end
14
14
 
15
15
  def destroy(instance)
@@ -29,15 +29,23 @@ module Wayfarer
29
29
 
30
30
  private
31
31
 
32
- def options
33
- Wayfarer.config.selenium.options.merge(http_client: http_client)
32
+ def driver
33
+ Wayfarer.config.dig(:selenium, :driver)
34
34
  end
35
35
 
36
- def http_client
37
- ::Selenium::WebDriver::Remote::Http::Default.new.tap do |client|
38
- client.read_timeout = Wayfarer.config.selenium.client_timeout
39
- end
36
+ def options
37
+ Wayfarer.config.dig(:selenium, :options)
40
38
  end
39
+
40
+ # def options
41
+ # Wayfarer.config[:selenium][:options].merge(http_client: http_client)
42
+ # end
43
+
44
+ # def http_client
45
+ # ::Selenium::WebDriver::Remote::Http::Default.new.tap do |client|
46
+ # client.read_timeout = Wayfarer.config[:selenium][:client_timeout]
47
+ # end
48
+ # end
41
49
  end
42
50
  end
43
51
  end
@@ -13,13 +13,13 @@ module Wayfarer
13
13
  end
14
14
 
15
15
  def navigate(_instance, _url)
16
- raise NoMethodError
16
+ raise NotImplementedError
17
17
  end
18
18
 
19
19
  def live(_instance); end
20
20
 
21
21
  def create
22
- raise NoMethodError
22
+ raise NotImplementedError
23
23
  end
24
24
 
25
25
  def destroy(_instance); end
data/lib/wayfarer/page.rb CHANGED
@@ -1,12 +1,22 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Wayfarer
4
+ # @!attribute [r] url
5
+ # @return [String] the URL that was fetched
6
+ # @!attribute [r] status_code
7
+ # @return [Fixnum] HTTP status code
8
+ # @!attribute [r] body
9
+ # @return [String] the body of the response
10
+ # @!attribute [r] headers
11
+ # @return [Hash] the headers of the response
12
+ # @note HTTP header keys are downcased, for example: `content-type`.
4
13
  class Page
5
14
  attr_reader :url,
6
15
  :status_code,
7
16
  :body,
8
17
  :headers
9
18
 
19
+ # @!visibility private
10
20
  def initialize(url:, status_code:, body:, headers:)
11
21
  @url = url
12
22
  @status_code = status_code
@@ -14,24 +24,34 @@ module Wayfarer
14
24
  @headers = headers.transform_keys(&:downcase)
15
25
  end
16
26
 
17
- def doc
18
- return @doc if @doc
19
-
20
- # If no Content-Type field is present, assume HTML/XML
21
- return @doc = Wayfarer::Parsing::XML.parse_html(body) unless headers["content-type"]
22
-
23
- content_type = headers["content-type"]
24
- sub_type = MIME::Types[content_type].first.sub_type
27
+ # Returns the MIME type of the response.
28
+ # @return [MIME::Type]
29
+ # @see https://www.rubydoc.info/gems/mime-types/MIME/Type
30
+ def mime_type
31
+ @mime_type ||= MIME::Types[content_type]&.first
32
+ end
25
33
 
26
- @doc = case sub_type
27
- when "json" then Wayfarer::Parsing::JSON.parse(body)
28
- when "xml" then Wayfarer::Parsing::XML.parse_xml(body)
29
- else Wayfarer::Parsing::XML.parse_html(body)
30
- end
34
+ # Returns a parsed representation of the HTTP response or the browser DOM,
35
+ # depending on the Content-Type.
36
+ # @return [Nokogiri::HTML::Document] when Content-Type is `text/html`
37
+ # @see https://www.rubydoc.info/github/sparklemotion/nokogiri/Nokogiri/HTML/Document Nokogiri::HTML::Document
38
+ # @return [Nokogiri::XML::Document] when Content-Type is `text/xml`
39
+ # @see https://www.rubydoc.info/github/sparklemotion/nokogiri/Nokogiri/XML/Document Nokogiri::XML::Document
40
+ # @return [Hash] when Content-Type is `application/json`
41
+ # @note You can register custom parsers with {Wayfarer::Parsing.registry}.
42
+ def doc
43
+ @doc ||= Wayfarer::Parsing.parse(body, mime_type&.content_type || content_type)
31
44
  end
32
45
 
46
+ # Returns a `MetaInspector::Document`.
47
+ # @return [MetaInspector::Document]
48
+ # @see https://www.rubydoc.info/gems/metainspector/MetaInspector/Document
33
49
  def meta
34
- @meta ||= MetaInspector.new(url, document: body)
50
+ @meta ||= MetaInspector.new(url, document: body, headers: headers, normalize_url: false)
51
+ end
52
+
53
+ def content_type
54
+ @content_type ||= headers["content-type"]
35
55
  end
36
56
  end
37
57
  end
@@ -5,12 +5,12 @@ module Wayfarer
5
5
  module XML
6
6
  module_function
7
7
 
8
- def parse_xml(xml)
9
- Nokogiri::XML(xml)
10
- end
11
-
12
- def parse_html(html)
13
- Nokogiri::HTML(html)
8
+ def parse(xml, variant)
9
+ case variant
10
+ when :xml then Nokogiri::XML(xml)
11
+ when :html then Nokogiri::HTML(xml)
12
+ else raise ArgumentError, "Unknown type: #{variant}"
13
+ end
14
14
  end
15
15
  end
16
16
  end
@@ -0,0 +1,21 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Wayfarer
4
+ # @!scope class
5
+ # @!attribute [r] registry
6
+ # @return [Hash] Mapping of Content-Type to parser.
7
+ module Parsing
8
+ # @!visibility private
9
+ FALLBACK_CONTENT_TYPE = "application/octet-stream"
10
+
11
+ module_function
12
+
13
+ # @!visibility private
14
+ def parse(body, content_type = FALLBACK_CONTENT_TYPE)
15
+ parser, args = Wayfarer.config.dig(:parsing, :registry, content_type)
16
+ return unless parser
17
+
18
+ parser.parse(body, *args)
19
+ end
20
+ end
21
+ end
@@ -2,35 +2,40 @@
2
2
 
3
3
  module Wayfarer
4
4
  module Redis
5
- Barrier = Struct.new(:batch) do
6
- include Connection
5
+ # A Barrier prevents processing the same key more than once.
6
+ # It marks keys in a Redis hash as they are encountered.
7
+ # Once a key is added, it cannot get removed.
8
+ class Barrier
9
+ include Resettable
7
10
 
8
- def redis_key
9
- "wayfarer-barrier-#{batch}"
10
- end
11
+ # @return [Hash] the task configuration, including redis_pool and batch name
12
+ attr_reader :task
11
13
 
12
- def reset!
13
- redis { |conn| conn.del(redis_key) }
14
- end
14
+ VALUE = ""
15
15
 
16
- def seen?(url)
17
- !redis { |conn| conn.sadd(redis_key, url) }
16
+ # @param task [Wayfarer::Task] task context
17
+ def initialize(task)
18
+ @task = task
19
+ @redis_pool = task[:redis_pool]
18
20
  end
19
21
 
20
- def peek(urls)
21
- major, minor, = Version.determine
22
-
23
- # SMISMEMBER is only supported on Redis >= 6.2.0
24
- if major > 6 || (major == 6 && minor >= 2)
25
- redis { |conn| conn.smismember(redis_key, urls) }.map { |val| val == 1 }
26
- else
27
- urls.map { |url| redis { |conn| conn.sismember(redis_key, url) } }
28
- end
22
+ # @return [String] the Redis key for this barrier
23
+ def redis_key
24
+ "wayfarer-barrier-#{task.batch}"
29
25
  end
30
26
 
31
- def unsee(url)
32
- redis { |conn| conn.srem(redis_key, url) }
27
+ # Checks if a key has already been passed through the barrier.
28
+ #
29
+ # @param key [String] the key to check
30
+ # @return [Boolean] true if the key has already been seen, false otherwise
31
+ def check!(key)
32
+ !redis_pool.with { |conn| conn.hsetnx(redis_key, key, VALUE) }
33
33
  end
34
+
35
+ private
36
+
37
+ # @return [ConnectionPool] the Redis connection pool
38
+ attr_reader :redis_pool
34
39
  end
35
40
  end
36
41
  end
@@ -2,28 +2,37 @@
2
2
 
3
3
  module Wayfarer
4
4
  module Redis
5
- Counter = Struct.new(:batch) do
6
- include Connection
5
+ class Counter
6
+ include Resettable
7
7
 
8
- def redis_key
9
- "wayfarer-counter-#{batch}"
8
+ attr_reader :task
9
+
10
+ def initialize(task)
11
+ @task = task
12
+ @redis_pool = task[:redis_pool]
10
13
  end
11
14
 
12
- def reset!
13
- redis { |conn| conn.del(redis_key) }
15
+ def redis_key
16
+ "wayfarer-counter-#{@task.batch}"
14
17
  end
15
18
 
16
19
  def value
17
- redis { |conn| conn.get(redis_key) }.to_i
20
+ redis_pool.with { |conn| conn.get(redis_key) }.to_i
18
21
  end
19
22
 
20
23
  def increment
21
- redis { |conn| conn.incr(redis_key) }
24
+ redis_pool.with { |conn| conn.incr(redis_key) }
22
25
  end
23
26
 
24
27
  def decrement
25
- redis { |conn| conn.decr(redis_key) }
28
+ redis_pool.with { |conn| conn.decr(redis_key) }.tap do |val|
29
+ @callback&.call if val == 0
30
+ end
26
31
  end
32
+
33
+ private
34
+
35
+ attr_reader :redis_pool
27
36
  end
28
37
  end
29
38
  end
@@ -10,7 +10,7 @@ module Wayfarer
10
10
 
11
11
  def initialize
12
12
  @pool = ConnectionPool.new do
13
- Wayfarer.config.redis.factory.call(Wayfarer.config.redis)
13
+ Wayfarer.config[:redis][:factory].call(Wayfarer.config[:redis])
14
14
  end
15
15
  end
16
16
 
@@ -0,0 +1,19 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Wayfarer
4
+ module Redis
5
+ module Resettable
6
+ def reset!
7
+ redis_pool.with { |conn| conn.del(redis_key) }
8
+ end
9
+
10
+ def redis_pool
11
+ raise NotImplementedError
12
+ end
13
+
14
+ def redis_key
15
+ raise NotImplementedError
16
+ end
17
+ end
18
+ end
19
+ end
@@ -2,56 +2,192 @@
2
2
 
3
3
  module Wayfarer
4
4
  module Routing
5
+ # Routing DSL that declares a tree of {Route}s. Routing trees decide
6
+ # whether a URL gets processed. Each route has a matcher which is a
7
+ # predicate. Routes are searched depth-first by a {PathFinder}.
8
+ #
9
+ # When you call a DSL method on a route, you declare a child route on the
10
+ # route. You can pass a keyword list and a block when you call DSL methods.
11
+ # Keyword lists result in route chains. For example, if you declare:
12
+ #
13
+ # ```ruby
14
+ # route.host "example.com", path: ":foo", query: { page: 1 }
15
+ # ```
16
+ #
17
+ # The last route with the query matcher is returned, and the hierarchy is
18
+ # `host > path > query`.
19
+ #
20
+ # To append multiple child rules, pass a block within which you can call
21
+ # DSL methods to declare child routes:
22
+ #
23
+ # ```ruby
24
+ # route.host "example.com" do
25
+ # path ":foo"
26
+ # query page: 1
27
+ # end
28
+ # ```
29
+ #
30
+ # The query matcher is returned again, but the hierarchy is
31
+ # `host > path`, `host > query`.
32
+ #
33
+ # @see Route
34
+ # @see PathFinder
5
35
  module DSL
6
- def url(url, options = {}, &block)
7
- add_child_route(Matchers::URL.new(url), path_offset, options, &block)
36
+ # Match URLs exactly.
37
+ #
38
+ # A trailing slash in +url+ is ignored so
39
+ # "https://example.com" and "https://example.com/" are equivalent.
40
+ #
41
+ # This matcher doesn't collect `params`.
42
+ #
43
+ # @param url [String]
44
+ # @param options [Hash]
45
+ # @yield [route]
46
+ # @return [Wayfarer::Routing::Route]
47
+ #
48
+ # @example Match URL exactly
49
+ # route.url("https://www.iana.org/help/example-domains").to(:index)
50
+ def url(url, **, &)
51
+ child_route(Matchers::URL.new(url), **, &)
8
52
  end
9
53
 
10
- def host(host, options = {}, &block)
11
- add_child_route(Matchers::Host.new(host), path_offset, options, &block)
54
+ # Match hostnames excluding the port number.
55
+ #
56
+ # * `String` is compared literally
57
+ # * `Regexp` is matched against the host
58
+ #
59
+ # This matcher doesn't collect `params`.
60
+ #
61
+ # @param host [String, Regexp]
62
+ # @param options [Hash]
63
+ # @yield [route]
64
+ # @return [Wayfarer::Routing::Route]
65
+ #
66
+ # @example Literal host
67
+ # route.host("example.com").to(:home)
68
+ # @example Regular expression
69
+ # route.host(/example\.com/) do
70
+ # path "users/:id", to: :user
71
+ # end
72
+ def host(host, **, &)
73
+ child_route(Matchers::Host.new(host), **, &)
12
74
  end
13
75
 
14
- def path(path, options = {}, &block)
15
- offset = File.join(path_offset, path)
16
- add_child_route(nil, offset, options, &block).tap do |route|
17
- route.matcher = Matchers::Path.new(offset, route)
18
- end
19
- end
76
+ # Match and consume path fragments. You can use Sinatra-style pattern
77
+ # matching to extract data from segments.
78
+ #
79
+ # @see https://github.com/sinatra/mustermann/blob/main/mustermann/README.md#-sinatra-pattern
80
+ #
81
+ # This matcher doesn't collect `params`.
82
+ # A leading slash is enforced on the path string.
83
+ #
84
+ # @example Capture segment
85
+ # route.path(":segment").to(:show)
86
+ # @example Nested paths
87
+ # route.path("/").path("foo").path(":id").to(:detail)
88
+ # @param path [String]
89
+ # @param options [Hash]
90
+ # @yield [route]
91
+ # @return [Wayfarer::Routing::Route]
92
+ def path(path, **, &)
93
+ path = File.join(File::SEPARATOR, path)
20
94
 
21
- def query(fields, options = {}, &block)
22
- add_child_route(Matchers::Query.new(fields), path_offset, options, &block)
95
+ child_route(Matchers::Path.new(path), **, &)
23
96
  end
24
97
 
25
- def scheme(scheme, options = {}, &block)
26
- add_child_route(Matchers::Scheme.new(scheme), path_offset, options, &block)
98
+ # Match query parameters.
99
+ #
100
+ # Each key/value pair must be present *at least once*; if multiple values
101
+ # occur the last one wins (like Rack).
102
+ #
103
+ # Supported value types:
104
+ #
105
+ # * `String` - exact match
106
+ # * `Regexp` - regular expression match
107
+ # * `Integer` - exact numeric match
108
+ # * `Range` - inclusive numeric range
109
+ #
110
+ # This matcher doesn't collect `params`.
111
+ #
112
+ # @param fields [Hash{Symbol,String => String,Regexp,Integer,Range}]
113
+ # @param options [Hash]
114
+ # @yield [route]
115
+ # @return [Wayfarer::Routing::Route]
116
+ #
117
+ # @example Simple parameter
118
+ # route.query(foo: "bar").to(:index)
119
+ # @example Page range
120
+ # route.query(page: 5..12).to(:index)
121
+ def query(fields, **, &)
122
+ child_route(Matchers::Query.new(fields), **, &)
27
123
  end
28
124
 
29
- def suffix(suffix, options = {}, &block)
30
- add_child_route(Matchers::Suffix.new(suffix), path_offset, options, &block)
125
+ # Match URL schemes (protocols).
126
+ #
127
+ # This matcher doesn't collect `params`.
128
+ #
129
+ # @param scheme [String, Symbol]
130
+ # @param options [Hash]
131
+ # @yield [route]
132
+ # @return [Wayfarer::Routing::Route]
133
+ #
134
+ # @example HTTPS vs HTTP
135
+ # route.scheme(:https).to(:tls)
136
+ # route.scheme(:http).to(:plain)
137
+ def scheme(scheme, **, &)
138
+ child_route(Matchers::Scheme.new(scheme), **, &)
31
139
  end
32
140
 
33
- def to(action, options = {}, &block)
34
- add_child_route(Matchers::Custom.new { true }, path_offset, TargetRoute, options, &block).tap do |route|
35
- route.action = action
36
- end
141
+ # Declares the action for the current route branch. An action is a symbol
142
+ # for an instance method, or a {Wayfarer::Handler}.
143
+ #
144
+ # In case of conflicting actions for a matching route path, the last
145
+ # matched action takes precedence.
146
+ #
147
+ # @param action [Symbol, Wayfarer::Handler] method or {Handler} to call.
148
+ # @param options [Hash]
149
+ # @yield [route]
150
+ # @return [Wayfarer::Routing::TargetRoute]
151
+ #
152
+ # @example Last action wins
153
+ # route.to(:alpha).to(:beta) # => routes to :beta
154
+ def to(action, **, &)
155
+ child_route(nil, action: action, klass: TargetRoute, **, &)
37
156
  end
38
157
 
39
- def custom(delegate, options = {}, &block)
40
- add_child_route(Matchers::Custom.new(delegate), path_offset, options, &block)
158
+ # Match URLs dynamically by declaring a route from a block during route
159
+ # evaluation. Custom matchers are passed a transient root route which will
160
+ # be followed. Custom matchers match when their dynamically declared
161
+ # subtree matches the URL.
162
+ #
163
+ # @param options [Hash]
164
+ # @yield [root, uri, task]
165
+ # @yieldparam [Wayfarer::Routing::RootRoute] root route to populate
166
+ # @yieldparam uri [Addressable::URI] parsed task URL
167
+ # @yieldparam task [Wayfarer::Task] current task
168
+ # @return [Wayfarer::Routing::Route]
169
+ #
170
+ # @example Batch routing
171
+ # route.custom do |root, _uri, task|
172
+ # database_record = Crawl.find_by(batch: task.batch)
173
+ # root.host(database_record.hostname_to_crawl).to(:index)
174
+ # end
175
+ def custom(**, &block)
176
+ child_route(Matchers::Custom.new(block), klass: SubRoute, **)
41
177
  end
42
178
 
43
179
  private
44
180
 
45
- # rubocop:disable Style/OptionalArguments
46
- def add_child_route(matcher, path_offset, klass = Route, options, &block)
47
- klass.new(matcher, path_offset, &block).tap do |route|
48
- route.parent = self
49
- leaf = options.reduce(route) { |acc, (key, val)| acc.public_send(key, val) }
50
- children.push(route)
51
- Docile.dsl_eval(leaf, &block) if block_given?
181
+ # @param matcher [Wayfarer::Routing::Matcher, nil]
182
+ # @param klass [Class<Route>]
183
+ # @param options [Hash]
184
+ # @yield [route]
185
+ # @return [Wayfarer::Routing::Route]
186
+ def child_route(matcher, klass: Route, **, &)
187
+ klass.new(matcher: matcher, parent: self, **, &).tap do |route|
188
+ children.append(route)
52
189
  end
53
190
  end
54
- # rubocop:enable Style/OptionalArguments
55
191
  end
56
192
  end
57
193
  end
@@ -0,0 +1,33 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Wayfarer
4
+ module Routing
5
+ class HashStack
6
+ EmptyStackError = Class.new(StandardError)
7
+
8
+ def self.empty
9
+ new(Route::EMPTY_PARAMS)
10
+ end
11
+
12
+ def initialize(initial_state)
13
+ @stack = [initial_state]
14
+ end
15
+
16
+ def push(hash)
17
+ stack.push(stack.last.dup.merge!(hash))
18
+ end
19
+
20
+ def pop
21
+ stack.pop || raise(EmptyStackError)
22
+ end
23
+
24
+ def to_h
25
+ stack.last
26
+ end
27
+
28
+ private
29
+
30
+ attr_reader :stack
31
+ end
32
+ end
33
+ end
@@ -5,19 +5,22 @@ module Wayfarer
5
5
  module Matchers
6
6
  class Custom
7
7
  include Stringify
8
+ include EmptyParams
8
9
 
9
10
  attr_reader :delegate
10
11
 
11
- def initialize(delegate = proc)
12
+ def initialize(delegate)
12
13
  @delegate = delegate
13
14
  end
14
15
 
15
- def match(url)
16
- !!delegate.call(url)
16
+ def evaluate(path_finder)
17
+ Wayfarer::Routing::RootRoute.new.tap do |route|
18
+ delegate.call(route, path_finder.uri, path_finder.task)
19
+ end
17
20
  end
18
21
 
19
- def params(_)
20
- {}
22
+ def to_h
23
+ { custom: delegate.class.name }
21
24
  end
22
25
  end
23
26
  end