wayfarer 0.4.6 → 0.4.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (259)
  1. checksums.yaml +4 -4
  2. data/.env +17 -0
  3. data/.github/workflows/lint.yaml +27 -0
  4. data/.github/workflows/release.yaml +30 -0
  5. data/.github/workflows/tests.yaml +21 -0
  6. data/.gitignore +5 -1
  7. data/.rubocop.yml +36 -0
  8. data/.vale.ini +8 -0
  9. data/.yardopts +1 -3
  10. data/Dockerfile +6 -4
  11. data/Gemfile +24 -0
  12. data/Gemfile.lock +274 -164
  13. data/Rakefile +7 -51
  14. data/bin/wayfarer +1 -1
  15. data/docker-compose.yml +23 -13
  16. data/docs/cookbook/consent_screen.md +2 -2
  17. data/docs/cookbook/executing_javascript.md +3 -3
  18. data/docs/cookbook/navigation.md +12 -12
  19. data/docs/cookbook/querying_html.md +3 -3
  20. data/docs/cookbook/screenshots.md +2 -2
  21. data/docs/guides/callbacks.md +25 -125
  22. data/docs/guides/cli.md +71 -0
  23. data/docs/guides/configuration.md +10 -35
  24. data/docs/guides/development.md +67 -0
  25. data/docs/guides/handlers.md +60 -0
  26. data/docs/guides/index.md +1 -0
  27. data/docs/guides/jobs.md +142 -31
  28. data/docs/guides/navigation.md +1 -1
  29. data/docs/guides/networking/capybara.md +13 -22
  30. data/docs/guides/networking/custom_adapters.md +103 -41
  31. data/docs/guides/networking/ferrum.md +4 -4
  32. data/docs/guides/networking/http.md +9 -13
  33. data/docs/guides/networking/selenium.md +10 -11
  34. data/docs/guides/pages.md +78 -10
  35. data/docs/guides/redis.md +10 -0
  36. data/docs/guides/routing.md +156 -0
  37. data/docs/guides/tasks.md +53 -9
  38. data/docs/guides/tutorial.md +66 -0
  39. data/docs/guides/user_agents.md +115 -0
  40. data/docs/index.md +17 -40
  41. data/lib/wayfarer/base.rb +125 -46
  42. data/lib/wayfarer/batch_completion.rb +60 -0
  43. data/lib/wayfarer/callbacks.rb +22 -48
  44. data/lib/wayfarer/cli/route_printer.rb +85 -89
  45. data/lib/wayfarer/cli.rb +103 -0
  46. data/lib/wayfarer/gc.rb +18 -6
  47. data/lib/wayfarer/handler.rb +15 -7
  48. data/lib/wayfarer/kv.rb +28 -0
  49. data/lib/wayfarer/logging.rb +38 -0
  50. data/lib/wayfarer/middleware/base.rb +2 -0
  51. data/lib/wayfarer/middleware/batch_completion.rb +19 -0
  52. data/lib/wayfarer/middleware/chain.rb +7 -1
  53. data/lib/wayfarer/middleware/content_type.rb +59 -0
  54. data/lib/wayfarer/middleware/controller.rb +19 -15
  55. data/lib/wayfarer/middleware/dedup.rb +22 -13
  56. data/lib/wayfarer/middleware/dispatch.rb +17 -4
  57. data/lib/wayfarer/middleware/normalize.rb +7 -14
  58. data/lib/wayfarer/middleware/redis.rb +15 -0
  59. data/lib/wayfarer/middleware/router.rb +33 -35
  60. data/lib/wayfarer/middleware/stage.rb +5 -5
  61. data/lib/wayfarer/middleware/uri_parser.rb +31 -0
  62. data/lib/wayfarer/middleware/user_agent.rb +49 -0
  63. data/lib/wayfarer/networking/capybara.rb +1 -1
  64. data/lib/wayfarer/networking/context.rb +14 -3
  65. data/lib/wayfarer/networking/ferrum.rb +1 -4
  66. data/lib/wayfarer/networking/follow.rb +14 -7
  67. data/lib/wayfarer/networking/http.rb +1 -1
  68. data/lib/wayfarer/networking/pool.rb +23 -13
  69. data/lib/wayfarer/networking/selenium.rb +15 -7
  70. data/lib/wayfarer/networking/strategy.rb +2 -2
  71. data/lib/wayfarer/page.rb +34 -14
  72. data/lib/wayfarer/parsing/xml.rb +6 -6
  73. data/lib/wayfarer/parsing.rb +21 -0
  74. data/lib/wayfarer/redis/barrier.rb +26 -21
  75. data/lib/wayfarer/redis/counter.rb +18 -9
  76. data/lib/wayfarer/redis/pool.rb +1 -1
  77. data/lib/wayfarer/redis/resettable.rb +19 -0
  78. data/lib/wayfarer/routing/dsl.rb +166 -30
  79. data/lib/wayfarer/routing/hash_stack.rb +33 -0
  80. data/lib/wayfarer/routing/matchers/custom.rb +8 -5
  81. data/lib/wayfarer/routing/matchers/{suffix.rb → empty_params.rb} +2 -6
  82. data/lib/wayfarer/routing/matchers/host.rb +15 -9
  83. data/lib/wayfarer/routing/matchers/path.rb +11 -31
  84. data/lib/wayfarer/routing/matchers/query.rb +41 -17
  85. data/lib/wayfarer/routing/matchers/result.rb +12 -0
  86. data/lib/wayfarer/routing/matchers/scheme.rb +13 -5
  87. data/lib/wayfarer/routing/matchers/url.rb +13 -5
  88. data/lib/wayfarer/routing/path_consumer.rb +130 -0
  89. data/lib/wayfarer/routing/path_finder.rb +151 -23
  90. data/lib/wayfarer/routing/result.rb +1 -1
  91. data/lib/wayfarer/routing/root_route.rb +17 -1
  92. data/lib/wayfarer/routing/route.rb +66 -19
  93. data/lib/wayfarer/routing/serializable.rb +28 -0
  94. data/lib/wayfarer/routing/sub_route.rb +53 -0
  95. data/lib/wayfarer/routing/target_route.rb +17 -1
  96. data/lib/wayfarer/stringify.rb +21 -30
  97. data/lib/wayfarer/task.rb +9 -17
  98. data/lib/wayfarer/uri/normalization.rb +120 -0
  99. data/lib/wayfarer.rb +72 -5
  100. data/mise.toml +2 -0
  101. data/mkdocs.yml +44 -8
  102. data/rake/docs.rake +26 -0
  103. data/rake/lint.rake +9 -0
  104. data/rake/release.rake +23 -0
  105. data/rake/tests.rake +32 -0
  106. data/requirements.txt +1 -1
  107. data/spec/factories/job.rb +8 -0
  108. data/spec/factories/middleware.rb +2 -2
  109. data/spec/factories/path_finder.rb +11 -0
  110. data/spec/factories/redis.rb +19 -0
  111. data/spec/factories/task.rb +46 -2
  112. data/spec/spec_helpers.rb +55 -51
  113. data/spec/support/active_job_helpers.rb +8 -0
  114. data/spec/support/integration_helpers.rb +21 -0
  115. data/spec/support/redis_helpers.rb +9 -0
  116. data/spec/support/test_app.rb +66 -37
  117. data/spec/wayfarer/base_spec.rb +200 -0
  118. data/spec/wayfarer/batch_completion_spec.rb +142 -0
  119. data/spec/wayfarer/cli/job_spec.rb +88 -0
  120. data/spec/wayfarer/cli/routing_spec.rb +322 -0
  121. data/spec/{cli → wayfarer/cli}/version_spec.rb +1 -1
  122. data/spec/wayfarer/gc_spec.rb +29 -0
  123. data/spec/wayfarer/handler_spec.rb +9 -0
  124. data/spec/wayfarer/integration/callbacks_spec.rb +200 -0
  125. data/spec/wayfarer/integration/content_type_spec.rb +37 -0
  126. data/spec/wayfarer/integration/custom_routing_spec.rb +51 -0
  127. data/spec/wayfarer/integration/gc_spec.rb +40 -0
  128. data/spec/wayfarer/integration/handler_spec.rb +65 -0
  129. data/spec/wayfarer/integration/page_spec.rb +79 -0
  130. data/spec/wayfarer/integration/params_spec.rb +64 -0
  131. data/spec/wayfarer/integration/parsing_spec.rb +99 -0
  132. data/spec/wayfarer/integration/retry_spec.rb +112 -0
  133. data/spec/wayfarer/integration/stage_spec.rb +58 -0
  134. data/spec/wayfarer/middleware/batch_completion_spec.rb +33 -0
  135. data/spec/{middleware → wayfarer/middleware}/chain_spec.rb +24 -19
  136. data/spec/wayfarer/middleware/content_type_spec.rb +83 -0
  137. data/spec/{middleware → wayfarer/middleware}/controller_spec.rb +24 -22
  138. data/spec/wayfarer/middleware/dedup_spec.rb +66 -0
  139. data/spec/wayfarer/middleware/normalize_spec.rb +32 -0
  140. data/spec/wayfarer/middleware/router_spec.rb +102 -0
  141. data/spec/wayfarer/middleware/stage_spec.rb +63 -0
  142. data/spec/wayfarer/middleware/uri_parser_spec.rb +63 -0
  143. data/spec/wayfarer/middleware/user_agent_spec.rb +158 -0
  144. data/spec/wayfarer/networking/capybara_spec.rb +13 -0
  145. data/spec/{networking → wayfarer/networking}/context_spec.rb +46 -38
  146. data/spec/wayfarer/networking/ferrum_spec.rb +13 -0
  147. data/spec/{networking → wayfarer/networking}/follow_spec.rb +11 -6
  148. data/spec/wayfarer/networking/http_spec.rb +12 -0
  149. data/spec/{networking → wayfarer/networking}/pool_spec.rb +16 -14
  150. data/spec/wayfarer/networking/selenium_spec.rb +12 -0
  151. data/spec/{networking → wayfarer/networking}/strategy.rb +33 -54
  152. data/spec/wayfarer/page_spec.rb +69 -0
  153. data/spec/{parsing → wayfarer/parsing}/json_spec.rb +1 -1
  154. data/spec/wayfarer/parsing/xml_parse_spec.rb +25 -0
  155. data/spec/wayfarer/redis/barrier_spec.rb +39 -0
  156. data/spec/wayfarer/redis/counter_spec.rb +34 -0
  157. data/spec/{redis → wayfarer/redis}/pool_spec.rb +4 -3
  158. data/spec/{routing → wayfarer/routing}/dsl_spec.rb +12 -22
  159. data/spec/wayfarer/routing/hash_stack_spec.rb +63 -0
  160. data/spec/wayfarer/routing/integration_spec.rb +101 -0
  161. data/spec/wayfarer/routing/matchers/custom_spec.rb +39 -0
  162. data/spec/wayfarer/routing/matchers/host_spec.rb +56 -0
  163. data/spec/wayfarer/routing/matchers/matcher.rb +17 -0
  164. data/spec/wayfarer/routing/matchers/path_spec.rb +43 -0
  165. data/spec/wayfarer/routing/matchers/query_spec.rb +123 -0
  166. data/spec/wayfarer/routing/matchers/scheme_spec.rb +45 -0
  167. data/spec/wayfarer/routing/matchers/url_spec.rb +33 -0
  168. data/spec/wayfarer/routing/path_consumer_spec.rb +123 -0
  169. data/spec/wayfarer/routing/path_finder_spec.rb +409 -0
  170. data/spec/wayfarer/routing/root_route_spec.rb +51 -0
  171. data/spec/wayfarer/routing/route_spec.rb +74 -0
  172. data/spec/wayfarer/routing/sub_route_spec.rb +103 -0
  173. data/spec/wayfarer/task_spec.rb +13 -0
  174. data/spec/wayfarer/uri/normalization_spec.rb +98 -0
  175. data/spec/wayfarer_spec.rb +2 -2
  176. data/wayfarer.gemspec +18 -28
  177. metadata +797 -265
  178. data/.github/workflows/ci.yaml +0 -32
  179. data/.rbenv-gemsets +0 -1
  180. data/.ruby-version +0 -1
  181. data/RELEASING.md +0 -17
  182. data/docs/cookbook/user_agent.md +0 -7
  183. data/docs/guides/error_handling.md +0 -53
  184. data/docs/guides/networking.md +0 -94
  185. data/docs/guides/performance.md +0 -130
  186. data/docs/guides/reliability.md +0 -41
  187. data/docs/guides/routing/steering.md +0 -30
  188. data/docs/reference/api/base.md +0 -48
  189. data/docs/reference/cli.md +0 -61
  190. data/docs/reference/configuration_keys.md +0 -43
  191. data/docs/reference/environment_variables.md +0 -83
  192. data/lib/wayfarer/cli/base.rb +0 -45
  193. data/lib/wayfarer/cli/generate.rb +0 -17
  194. data/lib/wayfarer/cli/job.rb +0 -56
  195. data/lib/wayfarer/cli/route.rb +0 -29
  196. data/lib/wayfarer/cli/runner.rb +0 -34
  197. data/lib/wayfarer/cli/templates/Gemfile.tt +0 -5
  198. data/lib/wayfarer/cli/templates/job.rb.tt +0 -10
  199. data/lib/wayfarer/config/capybara.rb +0 -10
  200. data/lib/wayfarer/config/ferrum.rb +0 -11
  201. data/lib/wayfarer/config/networking.rb +0 -29
  202. data/lib/wayfarer/config/redis.rb +0 -14
  203. data/lib/wayfarer/config/root.rb +0 -11
  204. data/lib/wayfarer/config/selenium.rb +0 -21
  205. data/lib/wayfarer/config/strconv.rb +0 -45
  206. data/lib/wayfarer/config/struct.rb +0 -72
  207. data/lib/wayfarer/middleware/fetch.rb +0 -56
  208. data/lib/wayfarer/redis/connection.rb +0 -13
  209. data/lib/wayfarer/redis/version.rb +0 -19
  210. data/lib/wayfarer/routing/router.rb +0 -28
  211. data/spec/base_spec.rb +0 -224
  212. data/spec/callbacks_spec.rb +0 -102
  213. data/spec/cli/generate_spec.rb +0 -39
  214. data/spec/cli/job_spec.rb +0 -78
  215. data/spec/config/capybara_spec.rb +0 -18
  216. data/spec/config/ferrum_spec.rb +0 -24
  217. data/spec/config/networking_spec.rb +0 -73
  218. data/spec/config/redis_spec.rb +0 -32
  219. data/spec/config/root_spec.rb +0 -31
  220. data/spec/config/selenium_spec.rb +0 -56
  221. data/spec/config/strconv_spec.rb +0 -58
  222. data/spec/config/struct_spec.rb +0 -66
  223. data/spec/fixtures/dummy_job.rb +0 -7
  224. data/spec/gc_spec.rb +0 -59
  225. data/spec/handler_spec.rb +0 -11
  226. data/spec/integration/callbacks_spec.rb +0 -85
  227. data/spec/integration/page_spec.rb +0 -62
  228. data/spec/integration/params_spec.rb +0 -56
  229. data/spec/integration/stage_spec.rb +0 -51
  230. data/spec/integration/steering_spec.rb +0 -57
  231. data/spec/middleware/dedup_spec.rb +0 -88
  232. data/spec/middleware/dispatch_spec.rb +0 -43
  233. data/spec/middleware/fetch_spec.rb +0 -155
  234. data/spec/middleware/normalize_spec.rb +0 -29
  235. data/spec/middleware/router_spec.rb +0 -105
  236. data/spec/middleware/stage_spec.rb +0 -62
  237. data/spec/networking/capybara_spec.rb +0 -12
  238. data/spec/networking/ferrum_spec.rb +0 -12
  239. data/spec/networking/http_spec.rb +0 -12
  240. data/spec/networking/selenium_spec.rb +0 -12
  241. data/spec/page_spec.rb +0 -47
  242. data/spec/parsing/xml_spec.rb +0 -25
  243. data/spec/redis/barrier_spec.rb +0 -78
  244. data/spec/redis/counter_spec.rb +0 -32
  245. data/spec/redis/version_spec.rb +0 -13
  246. data/spec/routing/integration_spec.rb +0 -110
  247. data/spec/routing/matchers/custom_spec.rb +0 -31
  248. data/spec/routing/matchers/host_spec.rb +0 -49
  249. data/spec/routing/matchers/path_spec.rb +0 -43
  250. data/spec/routing/matchers/query_spec.rb +0 -137
  251. data/spec/routing/matchers/scheme_spec.rb +0 -25
  252. data/spec/routing/matchers/suffix_spec.rb +0 -41
  253. data/spec/routing/matchers/uri_spec.rb +0 -27
  254. data/spec/routing/path_finder_spec.rb +0 -33
  255. data/spec/routing/root_route_spec.rb +0 -29
  256. data/spec/routing/route_spec.rb +0 -43
  257. data/spec/routing/router_spec.rb +0 -24
  258. data/spec/task_spec.rb +0 -34
  259. data/spec/{stringify_spec.rb → wayfarer/stringify_spec.rb} +2 -2
@@ -5,42 +5,89 @@ module Wayfarer
5
5
  class Route
6
6
  include DSL
7
7
  include Stringify
8
+ include Serializable
8
9
 
9
- attr_reader :children
10
+ attr_reader :matcher,
11
+ :parent,
12
+ :children
10
13
 
11
- attr_accessor :matcher,
12
- :parent,
13
- :action,
14
- :path_offset
14
+ stringify :matcher
15
15
 
16
- stringify :matcher,
17
- :action,
18
- :path_offset,
19
- :children
16
+ EMPTY_PARAMS = {}.freeze
20
17
 
21
- def initialize(matcher = Matchers::Custom.new { children.any? }, path_offset = "/")
18
+ def initialize(
19
+ parent: nil,
20
+ matcher: nil,
21
+ action: nil,
22
+ **options,
23
+ &block
24
+ )
25
+ raise "missing parent" unless parent || is_a?(RootRoute)
26
+
27
+ @parent = parent
22
28
  @matcher = matcher
29
+ @action = action
30
+
23
31
  @children = []
24
- @path_offset = path_offset
32
+
33
+ leaf = options.reduce(self) { |acc, (key, val)| acc.public_send(key, val) }
34
+ leaf.instance_eval(&block) if block
25
35
  end
26
36
 
27
- def match(url)
28
- matcher.match(url)
37
+ # @return [true, false]
38
+ def root?
39
+ parent.nil?
29
40
  end
30
41
 
31
- def matches?(url)
32
- invoke(url).is_a?(Result::Match)
42
+ # @return [true, false]
43
+ def leaf?
44
+ children.empty?
33
45
  end
34
46
 
35
- def invoke(url)
36
- PathFinder.result(self, url)
47
+ # @return [false]
48
+ def target?
49
+ false
37
50
  end
38
51
 
39
- # Accepts a visitor for in-order traversal.
52
+ # Accepts a visitor for pre-order traversal.
40
53
  def accept(visitor)
41
- return unless visitor.visit(self)
54
+ visitor.enter(self)
55
+
56
+ return visitor.leave unless visitor.visit(self)
42
57
 
43
58
  children.each { |child| child.accept(visitor) }
59
+
60
+ visitor.leave
61
+ end
62
+
63
+ # @param path_finder [PathFinder]
64
+ # @return [Hash] the matcher's params; EMPTY_PARAMS when there is no matcher or it returns nil
65
+ def params(path_finder)
66
+ matcher&.params(path_finder) || EMPTY_PARAMS
67
+ end
68
+
69
+ # @param _path_finder [PathFinder] unused by this implementation
70
+ # @return [nil, Symbol, Wayfarer::Handler]
71
+ def action(_path_finder)
72
+ @action
73
+ end
74
+
75
+ # @param path_finder [PathFinder]
76
+ # @return [Result::Match, Result::Mismatch, Object]
77
+ def match(path_finder)
78
+ evaluate(path_finder)
79
+ end
80
+
81
+ # @param path_finder [PathFinder]
82
+ # @return [true, false, Wayfarer::Routing::Route]
83
+ def evaluate(path_finder)
84
+ matcher.evaluate(path_finder)
85
+ end
86
+
87
+ def to_h
88
+ return {} unless matcher
89
+
90
+ { matcher.class.name.demodulize.underscore => matcher.to_h }
44
91
  end
45
92
  end
46
93
  end
@@ -0,0 +1,28 @@
1
+ # lib/wayfarer/routing/serializable.rb
2
+ # frozen_string_literal: true
3
+
4
+ module Wayfarer
5
+ module Routing
6
+ module Serializable
7
+ def to_h
8
+ as_hash(self)
9
+ end
10
+
11
+ private
12
+
13
+ def as_hash(route)
14
+ {
15
+ matcher: matcher_name(route),
16
+ action: route.action(nil),
17
+ children: route.children.map { |child| as_hash(child) }
18
+ }.tap(&:compact!)
19
+ end
20
+
21
+ def matcher_name(route)
22
+ return nil unless route.matcher
23
+
24
+ route.matcher.class.name.split("::").last
25
+ end
26
+ end
27
+ end
28
+ end
@@ -0,0 +1,53 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Wayfarer
4
+ module Routing
5
+ class SubRoute < Route
6
+ def evaluate(path_finder)
7
+ handle(
8
+ path_finder,
9
+ match: ->(_result) { true },
10
+ mismatch: ->(_result) { false }
11
+ )
12
+ end
13
+
14
+ def params(path_finder)
15
+ handle(
16
+ path_finder,
17
+ match: lambda(&:params),
18
+ mismatch: ->(_result) { EMPTY_PARAMS }
19
+ )
20
+ end
21
+
22
+ def action(path_finder)
23
+ handle(
24
+ path_finder,
25
+ match: lambda(&:action),
26
+ mismatch: ->(_result) {}
27
+ )
28
+ end
29
+
30
+ private
31
+
32
+ def handle(path_finder, match:, mismatch:)
33
+ case root = evaluate_matcher(path_finder)
34
+ when Wayfarer::Routing::RootRoute
35
+ case result = sub_result(root, path_finder)
36
+ when Wayfarer::Routing::Result::Match then match.call(result)
37
+ when Wayfarer::Routing::Result::Mismatch then mismatch.call(result)
38
+ else raise "invalid result: #{result.inspect}"
39
+ end
40
+ else raise "#{route.inspect} is not a root route"
41
+ end
42
+ end
43
+
44
+ def evaluate_matcher(path_finder)
45
+ path_finder[matcher] ||= matcher.evaluate(path_finder)
46
+ end
47
+
48
+ def sub_result(route, path_finder)
49
+ path_finder[route] ||= Wayfarer::Routing::PathFinder.sub_result(route, path_finder)
50
+ end
51
+ end
52
+ end
53
+ end
@@ -2,6 +2,22 @@
2
2
 
3
3
  module Wayfarer
4
4
  module Routing
5
- class TargetRoute < Route; end
5
+ class TargetRoute < Route
6
+ def evaluate(_path_finder)
7
+ true
8
+ end
9
+
10
+ def target?
11
+ true
12
+ end
13
+
14
+ def to_h
15
+ { action: case @action
16
+ when Wayfarer::Handler then { class: @action.class.name }
17
+ when Array then { handler: @action.first.class.name, action: @action.second }
18
+ else @action
19
+ end }
20
+ end
21
+ end
6
22
  end
7
23
  end
@@ -1,46 +1,37 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- # 346325
4
-
5
3
  module Wayfarer
6
4
  module Stringify
7
- def self.included(base)
8
- base.include(InstanceMethods)
9
- base.extend(ClassMethods)
10
- base.instance_eval do
11
- cattr_accessor :stringified_attributes do
12
- []
13
- end
14
- end
5
+ extend ActiveSupport::Concern
6
+
7
+ included do
8
+ class_attribute :stringified_attributes,
9
+ default: [],
10
+ instance_accessor: false,
11
+ instance_predicate: false
12
+
13
+ alias_method :inspect, :to_s
15
14
  end
16
15
 
17
- module ClassMethods
16
+ class_methods do
18
17
  def stringify(*variables)
19
18
  stringified_attributes.concat(variables)
20
19
  end
21
20
  end
22
21
 
23
- module InstanceMethods
24
- def to_s
25
- if self.class.stringified_attributes.any?
26
- "#<#{class_name} #{attributes.join(', ')}>"
27
- else
28
- "#<#{class_name}>"
29
- end
30
- end
31
-
32
- alias inspect to_s
22
+ def to_s
23
+ class_name = self.class.name
33
24
 
34
- def class_name
35
- self.class.name
36
- end
25
+ if self.class.stringified_attributes.any?
26
+ attrs = self.class
27
+ .stringified_attributes
28
+ .to_h { |attr| [attr, public_send(attr)] }
29
+ .map { |k, v| "#{k}=#{v.inspect}" }
30
+ .join(", ")
37
31
 
38
- def attributes
39
- self.class
40
- .stringified_attributes
41
- .map { |attr| [attr, public_send(attr)] }
42
- .to_h
43
- .map { |k, v| [k, "=", v.inspect].join }
32
+ "#<#{class_name} #{attrs}>"
33
+ else
34
+ "#<#{class_name}>"
44
35
  end
45
36
  end
46
37
  end
data/lib/wayfarer/task.rb CHANGED
@@ -1,36 +1,28 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Wayfarer
4
+ # @!attribute [r] url
5
+ # @return [String] the URL to process
6
+ # @!attribute [r] batch
7
+ # @return [String] the batch the task belongs to
4
8
  class Task
9
+ include KV
5
10
  include Stringify
6
11
 
7
12
  attr_reader :url,
8
- :batch,
9
- :metadata
13
+ :batch
10
14
 
11
- stringify :url,
12
- :batch
15
+ stringify :url, :batch
13
16
 
17
+ # @!visibility private
14
18
  def initialize(url, batch)
15
19
  @url = url
16
20
  @batch = batch
17
- @metadata = OpenStruct.new
18
21
  end
19
22
 
23
+ # @!visibility private
20
24
  def ==(other)
21
25
  [url, batch] == [other.url, other.batch]
22
26
  end
23
-
24
- def barrier
25
- @barrier ||= Wayfarer::Redis::Barrier.new(batch)
26
- end
27
-
28
- def counter
29
- @counter ||= Wayfarer::Redis::Counter.new(batch)
30
- end
31
-
32
- def gc
33
- @gc ||= Wayfarer::GC.new(self)
34
- end
35
27
  end
36
28
  end
@@ -0,0 +1,120 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Wayfarer
4
+ module URI
5
+ # HTTP(S) URL normalization.
6
+ module Normalization
7
+ InvalidURIError = Class.new(StandardError)
8
+
9
+ # Raised when URI is relative.
10
+ RelativeURIError = Class.new(InvalidURIError)
11
+
12
+ # Raised when URI scheme is not hypertext.
13
+ NoHypertextError = Class.new(InvalidURIError)
14
+
15
+ # Raised when URI has no host.
16
+ NoHostError = Class.new(InvalidURIError)
17
+
18
+ extend self
19
+
20
+ # Normalizes `uri` in-place.
21
+ # @param uri [Addressable::URI]
22
+ # @raise [InvalidURIError]
23
+ # @return [Addressable::URI]
24
+ def canonical!(uri)
25
+ had_no_path = uri.path.blank?
26
+
27
+ uri.normalize!
28
+ validate_uri!(uri)
29
+
30
+ normalize_host!(uri) if remove_www?
31
+
32
+ if remove_trailing_slash?
33
+ normalize_path!(uri)
34
+ root_path!(uri)
35
+ end
36
+
37
+ remove_fragment!(uri) if remove_fragment?
38
+ normalize_query_params!(uri)
39
+
40
+ root_path!(uri) if had_no_path && uri.query.nil?
41
+
42
+ uri
43
+ end
44
+
45
+ private
46
+
47
+ def validate_uri!(uri)
48
+ raise RelativeURIError, "URL is not absolute" unless uri.absolute?
49
+ raise NoHypertextError, "URL is using unsupported protocol" unless supported_protocols.include?(uri.scheme)
50
+ raise NoHostError, "URL misses hostname" if uri.host.blank?
51
+ end
52
+
53
+ def normalize_query_params!(uri)
54
+ return unless remove_tracking_parameters? || order_query_parameters?
55
+ return unless (params = uri.query_values(Array))
56
+
57
+ remove_tracking_parameters!(params) if remove_tracking_parameters?
58
+ order_query_parameters!(params) if order_query_parameters?
59
+
60
+ uri.query_values = params.empty? ? nil : params
61
+ end
62
+
63
+ def remove_tracking_parameters!(params) # mutates params: drops configured tracking keys and any pair whose value is empty
64
+ params.reject! { |key, val| val.to_s.empty? || tracking_params.include?(key) }
65
+ end
66
+
67
+ def order_query_parameters!(params)
68
+ params.sort_by!(&:first)
69
+ end
70
+
71
+ def normalize_host!(uri)
72
+ uri.host &&= uri.host.delete_prefix("www.")
73
+ end
74
+
75
+ def normalize_path!(uri)
76
+ uri.path = uri.path.delete_suffix(File::SEPARATOR) if uri.path && uri.path.length > 1
77
+ end
78
+
79
+ def remove_fragment!(uri)
80
+ uri.fragment = nil
81
+ end
82
+
83
+ def root_path!(uri)
84
+ uri.path = "" if uri.path == File::SEPARATOR
85
+ end
86
+
87
+ def normalization_config
88
+ Wayfarer.config.fetch(:normalization)
89
+ end
90
+
91
+ def supported_protocols
92
+ normalization_config.fetch(:schemes)
93
+ end
94
+
95
+ def tracking_params
96
+ normalization_config.fetch(:tracking_params)
97
+ end
98
+
99
+ def remove_www?
100
+ normalization_config.fetch(:remove_www)
101
+ end
102
+
103
+ def remove_trailing_slash?
104
+ normalization_config.fetch(:remove_trailing_slash)
105
+ end
106
+
107
+ def remove_fragment?
108
+ normalization_config.fetch(:remove_fragment)
109
+ end
110
+
111
+ def remove_tracking_parameters?
112
+ normalization_config.fetch(:remove_tracking_parameters)
113
+ end
114
+
115
+ def order_query_parameters?
116
+ normalization_config.fetch(:order_query_parameters)
117
+ end
118
+ end
119
+ end
120
+ end
data/lib/wayfarer.rb CHANGED
@@ -3,13 +3,16 @@
3
3
  require "cgi"
4
4
  require "forwardable"
5
5
  require "net/http"
6
+ require "pp"
6
7
  require "securerandom"
7
8
  require "uri"
9
+ require "yaml"
8
10
 
9
11
  require "active_job"
12
+ require "active_support/core_ext/array/wrap"
13
+ require "active_support/core_ext/object/deep_dup"
10
14
  require "capybara"
11
15
  require "connection_pool"
12
- require "docile"
13
16
  require "ferrum"
14
17
  require "metainspector"
15
18
  require "mime/types"
@@ -18,7 +21,6 @@ require "mock_redis"
18
21
  require "mustermann"
19
22
  require "net/http/persistent"
20
23
  require "nokogiri"
21
- require "normalize_url"
22
24
  require "selenium-webdriver"
23
25
  require "redis"
24
26
  require "thor"
@@ -28,21 +30,84 @@ loader = Zeitwerk::Loader.for_gem
28
30
  loader.inflector.inflect("cli" => "CLI",
29
31
  "dsl" => "DSL",
30
32
  "http" => "HTTP",
33
+ "uri" => "URI",
31
34
  "url" => "URL",
32
35
  "xml" => "XML",
33
36
  "json" => "JSON",
34
- "gc" => "GC")
37
+ "gc" => "GC",
38
+ "kv" => "KV")
35
39
  loader.setup
36
40
 
37
41
  module Wayfarer
38
42
  module VERSION
39
43
  MAJOR = 0
40
44
  MINOR = 4
41
- TINY = 6
45
+ TINY = 8
42
46
  STRING = [MAJOR, MINOR, TINY].join(".")
43
47
  end
44
48
 
45
- mattr_accessor :config, default: Wayfarer::Config::Root.new
49
+ DEFAULT_CONFIG = {
50
+ redis: {
51
+ url: "redis://localhost:6379/0",
52
+ factory: ->(redis_config) { ::Redis.new(url: redis_config.fetch(:url)) }
53
+ },
54
+ network: {
55
+ agent: :http,
56
+ pool: {
57
+ size: 1,
58
+ timeout: 10
59
+ },
60
+ http_headers: {},
61
+ renew_on: []
62
+ },
63
+ parsing: {
64
+ registry: {
65
+ "application/json" => Wayfarer::Parsing::JSON,
66
+ "text/html" => [Wayfarer::Parsing::XML, :html],
67
+ "application/xml" => [Wayfarer::Parsing::XML, :xml]
68
+ }
69
+ },
70
+ normalization: {
71
+ remove_www: true,
72
+ remove_trailing_slash: true,
73
+ remove_fragment: true,
74
+ remove_tracking_parameters: true,
75
+ order_query_parameters: true,
76
+ schemes: %w[
77
+ http
78
+ https
79
+ ].to_set,
80
+ tracking_params: %w[
81
+ utm_source
82
+ utm_medium
83
+ utm_term
84
+ utm_content
85
+ utm_campaign
86
+ gclid
87
+ fbclid
88
+ msclkid
89
+ sms_ss
90
+ awesm
91
+ xtor
92
+ PHPSESSID
93
+ ].to_set
94
+ },
95
+ deduplication: {
96
+ key: ->(task) { task[:uri].to_s }
97
+ },
98
+ capybara: {
99
+ driver: nil
100
+ },
101
+ ferrum: {
102
+ options: {}
103
+ },
104
+ selenium: {
105
+ driver: :chrome,
106
+ options: {}
107
+ }
108
+ }.freeze
109
+
110
+ mattr_accessor :config, default: DEFAULT_CONFIG.deep_dup
46
111
 
47
112
  UserAgentTimeoutError = Class.new(StandardError) # TODO: Move to Networking namespace
48
113
  end
@@ -50,3 +115,5 @@ end
50
115
  loader.eager_load
51
116
 
52
117
  ActiveJob::Serializers.serializers << Wayfarer::Serializer
118
+
119
+ Wayfarer::BatchCompletion.subscribe!
data/mise.toml ADDED
@@ -0,0 +1,2 @@
1
+ [tools]
2
+ ruby = "3.4.4"
data/mkdocs.yml CHANGED
@@ -1,28 +1,40 @@
1
1
  repo_url: https://github.com/bauerd/wayfarer
2
2
  edit_uri: edit/develop/docs/
3
- repo_name: Source Code
3
+ repo_name: Code
4
4
  site_name: Wayfarer
5
5
  markdown_extensions:
6
6
  - admonition
7
+ - attr_list
7
8
  - meta
8
9
  - def_list
10
+ - pymdownx.snippets
9
11
  - pymdownx.details
10
12
  - pymdownx.highlight
11
- - pymdownx.superfences
13
+ - pymdownx.inlinehilite
14
+ - pymdownx.superfences:
15
+ custom_fences:
16
+ - name: mermaid
17
+ class: mermaid
18
+ format: !!python/name:pymdownx.superfences.fence_code_format
12
19
  - pymdownx.critic
13
20
  - pymdownx.caret
14
21
  - pymdownx.mark
15
22
  - pymdownx.tilde
16
- - pymdownx.tabbed
23
+ - pymdownx.tabbed:
24
+ alternate_style: true
17
25
  - pymdownx.tasklist:
18
26
  custom_checkbox: true
19
27
 
20
28
  theme:
29
+ icon:
30
+ logo: material/sign-direction
31
+ repo: fontawesome/brands/git
21
32
  name: material
22
33
  features:
23
34
  - navigation.tabs
24
- - navigation.sections
25
- - navigation.expand
35
+ # - navigation.sections
36
+ # - navigation.expand
37
+ - navigation.indexes
26
38
  font:
27
39
  text: IBM Plex Sans
28
40
  code: IBM Plex Sans Mono
@@ -41,7 +53,31 @@ theme:
41
53
  toggle:
42
54
  icon: material/lightbulb
43
55
  name: Switch to light mode
44
- icon:
45
- repo: fontawesome/brands/git
46
-
47
56
 
57
+ nav:
58
+ - Home: index.md
59
+ - Guides:
60
+ - Tutorial: guides/tutorial.md
61
+ - Jobs: guides/jobs.md
62
+ - Tasks: guides/tasks.md
63
+ - Pages: guides/pages.md
64
+ - Routing: guides/routing.md
65
+ - Callbacks: guides/callbacks.md
66
+ - Handlers: guides/handlers.md
67
+ - Configuration: guides/configuration.md
68
+ - Command-line interface: guides/cli.md
69
+ - Networking:
70
+ - Introduction: guides/user_agents.md
71
+ - User agent API: guides/networking/custom_adapters.md
72
+ - Built-in user agents:
73
+ - Plain HTTP: guides/networking/http.md
74
+ - Ferrum: guides/networking/ferrum.md
75
+ - Selenium: guides/networking/selenium.md
76
+ - Capybara: guides/networking/capybara.md
77
+ - Redis: guides/redis.md
78
+ - Development: guides/development.md
79
+ - Cookbook:
80
+ - Browser navigation: cookbook/navigation.md
81
+ - Executing JavaScript: cookbook/executing_javascript.md
82
+ - Screenshots: cookbook/screenshots.md
83
+ - API documentation: "https://www.rubydoc.info"
data/rake/docs.rake ADDED
@@ -0,0 +1,26 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "yard"
4
+
5
+ namespace :yard do
6
+ desc "Generate documentation"
7
+ YARD::Rake::YardocTask.new(:generate) do |t|
8
+ require_relative "../lib/wayfarer"
9
+
10
+ t.options = %w[--readme docs/index.md --no-private --markup markdown]
11
+ t.files = [
12
+ Wayfarer::Base,
13
+ Wayfarer::Task,
14
+ Wayfarer::Page,
15
+ Wayfarer::Handler,
16
+ Wayfarer::Routing::DSL,
17
+ Wayfarer::Parsing
18
+ ].freeze.map { |klass| Object.const_source_location(klass.name).first }
19
+ end
20
+
21
+ desc "Regenerate documentation on change"
22
+ task :watch do
23
+ # The output of `yard server` differs for some reason
24
+ sh "rerun --wait 0.5 --dir lib bundle exec rake yard:generate"
25
+ end
26
+ end
data/rake/lint.rake ADDED
@@ -0,0 +1,9 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "rubocop/rake_task"
4
+
5
+ namespace :lint do
6
+ RuboCop::RakeTask.new do |task|
7
+ task.formatters = %w[simple]
8
+ end
9
+ end