html2rss 0.17.0 → 0.19.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (127)
  1. checksums.yaml +4 -4
  2. data/README.md +90 -639
  3. data/exe/html2rss +1 -1
  4. data/html2rss.gemspec +5 -2
  5. data/lib/html2rss/articles/deduplicator.rb +50 -0
  6. data/lib/html2rss/auto_source/cleanup.rb +44 -5
  7. data/lib/html2rss/auto_source/scraper/html.rb +123 -43
  8. data/lib/html2rss/auto_source/scraper/json_state.rb +457 -0
  9. data/lib/html2rss/auto_source/scraper/microdata.rb +505 -0
  10. data/lib/html2rss/auto_source/scraper/schema/category_extractor.rb +102 -0
  11. data/lib/html2rss/auto_source/scraper/schema/item_list.rb +3 -2
  12. data/lib/html2rss/auto_source/scraper/schema/list_item.rb +6 -4
  13. data/lib/html2rss/auto_source/scraper/schema/thing.rb +69 -8
  14. data/lib/html2rss/auto_source/scraper/schema.rb +27 -12
  15. data/lib/html2rss/auto_source/scraper/semantic_html/anchor_selector.rb +204 -0
  16. data/lib/html2rss/auto_source/scraper/semantic_html.rb +88 -79
  17. data/lib/html2rss/auto_source/scraper/wordpress_api/page_scope.rb +311 -0
  18. data/lib/html2rss/auto_source/scraper/wordpress_api/posts_endpoint.rb +135 -0
  19. data/lib/html2rss/auto_source/scraper/wordpress_api.rb +186 -0
  20. data/lib/html2rss/auto_source/scraper.rb +160 -8
  21. data/lib/html2rss/auto_source.rb +123 -47
  22. data/lib/html2rss/blocked_surface.rb +65 -0
  23. data/lib/html2rss/category_extractor.rb +82 -0
  24. data/lib/html2rss/cli.rb +194 -23
  25. data/lib/html2rss/config/class_methods.rb +178 -0
  26. data/lib/html2rss/config/dynamic_params.rb +70 -0
  27. data/lib/html2rss/config/multiple_feeds_config.rb +51 -0
  28. data/lib/html2rss/config/request_headers.rb +136 -0
  29. data/lib/html2rss/config/schema.rb +240 -0
  30. data/lib/html2rss/config/validator.rb +146 -0
  31. data/lib/html2rss/config.rb +118 -61
  32. data/lib/html2rss/error.rb +31 -0
  33. data/lib/html2rss/feed_pipeline/auto_fallback.rb +127 -0
  34. data/lib/html2rss/feed_pipeline.rb +127 -0
  35. data/lib/html2rss/hash_util.rb +101 -0
  36. data/lib/html2rss/html_extractor/date_extractor.rb +20 -0
  37. data/lib/html2rss/html_extractor/enclosure_extractor.rb +120 -0
  38. data/lib/html2rss/html_extractor/image_extractor.rb +58 -0
  39. data/lib/html2rss/html_extractor.rb +141 -0
  40. data/lib/html2rss/html_navigator.rb +54 -0
  41. data/lib/html2rss/json_feed_builder/item.rb +94 -0
  42. data/lib/html2rss/json_feed_builder.rb +59 -0
  43. data/lib/html2rss/rendering/audio_renderer.rb +36 -0
  44. data/lib/html2rss/rendering/description_builder.rb +87 -0
  45. data/lib/html2rss/rendering/image_renderer.rb +41 -0
  46. data/lib/html2rss/rendering/media_renderer.rb +37 -0
  47. data/lib/html2rss/rendering/pdf_renderer.rb +34 -0
  48. data/lib/html2rss/rendering/video_renderer.rb +36 -0
  49. data/lib/html2rss/rendering.rb +23 -0
  50. data/lib/html2rss/request_controls.rb +123 -0
  51. data/lib/html2rss/request_service/botasaurus_contract.rb +161 -0
  52. data/lib/html2rss/request_service/botasaurus_strategy.rb +98 -0
  53. data/lib/html2rss/request_service/browserless_strategy.rb +103 -7
  54. data/lib/html2rss/request_service/budget.rb +39 -0
  55. data/lib/html2rss/request_service/context.rb +77 -21
  56. data/lib/html2rss/request_service/faraday_strategy.rb +137 -5
  57. data/lib/html2rss/request_service/policy.rb +252 -0
  58. data/lib/html2rss/request_service/puppet_commander.rb +212 -13
  59. data/lib/html2rss/request_service/response.rb +51 -3
  60. data/lib/html2rss/request_service/response_guard.rb +62 -0
  61. data/lib/html2rss/request_service.rb +50 -15
  62. data/lib/html2rss/request_session/rel_next_pager.rb +70 -0
  63. data/lib/html2rss/request_session/runtime_input.rb +71 -0
  64. data/lib/html2rss/request_session/runtime_policy.rb +83 -0
  65. data/lib/html2rss/request_session.rb +122 -0
  66. data/lib/html2rss/rss_builder/article.rb +187 -0
  67. data/lib/html2rss/rss_builder/channel.rb +105 -11
  68. data/lib/html2rss/rss_builder/enclosure.rb +62 -0
  69. data/lib/html2rss/rss_builder/stylesheet.rb +8 -4
  70. data/lib/html2rss/rss_builder.rb +76 -71
  71. data/lib/html2rss/selectors/config.rb +123 -0
  72. data/lib/html2rss/selectors/extractors/attribute.rb +52 -0
  73. data/lib/html2rss/selectors/extractors/href.rb +55 -0
  74. data/lib/html2rss/selectors/extractors/html.rb +49 -0
  75. data/lib/html2rss/selectors/extractors/static.rb +42 -0
  76. data/lib/html2rss/selectors/extractors/text.rb +47 -0
  77. data/lib/html2rss/selectors/extractors.rb +53 -0
  78. data/lib/html2rss/selectors/object_to_xml_converter.rb +62 -0
  79. data/lib/html2rss/selectors/post_processors/base.rb +80 -0
  80. data/lib/html2rss/selectors/post_processors/gsub.rb +88 -0
  81. data/lib/html2rss/selectors/post_processors/html_to_markdown.rb +48 -0
  82. data/lib/html2rss/selectors/post_processors/html_transformers/transform_urls_to_absolute_ones.rb +44 -0
  83. data/lib/html2rss/selectors/post_processors/html_transformers/wrap_img_in_a.rb +53 -0
  84. data/lib/html2rss/selectors/post_processors/markdown_to_html.rb +55 -0
  85. data/lib/html2rss/selectors/post_processors/parse_time.rb +78 -0
  86. data/lib/html2rss/selectors/post_processors/parse_uri.rb +43 -0
  87. data/lib/html2rss/selectors/post_processors/sanitize_html.rb +154 -0
  88. data/lib/html2rss/selectors/post_processors/substring.rb +77 -0
  89. data/lib/html2rss/selectors/post_processors/template.rb +76 -0
  90. data/lib/html2rss/selectors/post_processors.rb +48 -0
  91. data/lib/html2rss/selectors.rb +301 -0
  92. data/lib/html2rss/url.rb +266 -0
  93. data/lib/html2rss/version.rb +2 -1
  94. data/lib/html2rss.rb +67 -71
  95. data/lib/tasks/config_schema.rake +17 -0
  96. data/schema/html2rss-config.schema.json +551 -0
  97. metadata +120 -38
  98. data/lib/html2rss/attribute_post_processors/base.rb +0 -74
  99. data/lib/html2rss/attribute_post_processors/gsub.rb +0 -64
  100. data/lib/html2rss/attribute_post_processors/html_to_markdown.rb +0 -43
  101. data/lib/html2rss/attribute_post_processors/html_transformers/transform_urls_to_absolute_ones.rb +0 -27
  102. data/lib/html2rss/attribute_post_processors/html_transformers/wrap_img_in_a.rb +0 -41
  103. data/lib/html2rss/attribute_post_processors/markdown_to_html.rb +0 -50
  104. data/lib/html2rss/attribute_post_processors/parse_time.rb +0 -46
  105. data/lib/html2rss/attribute_post_processors/parse_uri.rb +0 -46
  106. data/lib/html2rss/attribute_post_processors/sanitize_html.rb +0 -115
  107. data/lib/html2rss/attribute_post_processors/substring.rb +0 -72
  108. data/lib/html2rss/attribute_post_processors/template.rb +0 -101
  109. data/lib/html2rss/attribute_post_processors.rb +0 -44
  110. data/lib/html2rss/auto_source/article.rb +0 -127
  111. data/lib/html2rss/auto_source/channel.rb +0 -78
  112. data/lib/html2rss/auto_source/reducer.rb +0 -48
  113. data/lib/html2rss/auto_source/rss_builder.rb +0 -70
  114. data/lib/html2rss/auto_source/scraper/semantic_html/extractor.rb +0 -136
  115. data/lib/html2rss/auto_source/scraper/semantic_html/image.rb +0 -54
  116. data/lib/html2rss/config/channel.rb +0 -125
  117. data/lib/html2rss/config/selectors.rb +0 -103
  118. data/lib/html2rss/item.rb +0 -186
  119. data/lib/html2rss/item_extractors/attribute.rb +0 -50
  120. data/lib/html2rss/item_extractors/href.rb +0 -52
  121. data/lib/html2rss/item_extractors/html.rb +0 -46
  122. data/lib/html2rss/item_extractors/static.rb +0 -39
  123. data/lib/html2rss/item_extractors/text.rb +0 -44
  124. data/lib/html2rss/item_extractors.rb +0 -88
  125. data/lib/html2rss/object_to_xml_converter.rb +0 -56
  126. data/lib/html2rss/rss_builder/item.rb +0 -83
  127. data/lib/html2rss/utils.rb +0 -113
@@ -2,6 +2,7 @@
2
2
 
3
3
  require 'faraday'
4
4
  require 'faraday/follow_redirects'
5
+ require 'faraday/gzip'
5
6
 
6
7
  module Html2rss
7
8
  class RequestService
@@ -9,15 +10,146 @@ module Html2rss
9
10
  # Strategy to use Faraday for the request.
10
11
  # @see https://rubygems.org/gems/faraday
11
12
  class FaradayStrategy < Strategy
12
- # return [Response]
13
+ ##
14
+ # Restores buffered streamed bytes so response middleware can process them.
15
+ class StreamingBodyMiddleware < Faraday::Middleware
16
+ # Request-context key used to store streamed chunks before middleware completion.
17
+ STREAM_BUFFER_KEY = :html2rss_stream_buffer
18
+
19
+ # @param env [Faraday::Env] completed response environment
20
+ # @return [void]
21
+ def on_complete(env)
22
+ buffer = env.request.context&.delete(STREAM_BUFFER_KEY)
23
+ return if buffer.nil? || buffer.empty?
24
+
25
+ env.body = buffer
26
+ end
27
+ end
28
+
29
+ ##
30
+ # Executes a request with runtime policy enforcement.
31
+ #
32
+ # @return [Response] normalized request response
33
+ # @note Unlike BrowserlessStrategy, Faraday does not expose the remote IP after connect.
34
+ # SSRF protection here is pre-connection only (DNS resolution via Policy).
35
+ # A DNS rebinding attack between resolution and connect cannot be caught at this layer.
13
36
  def execute
14
- request = Faraday.new(url: ctx.url, headers: ctx.headers) do |faraday|
15
- faraday.use Faraday::FollowRedirects::Middleware
37
+ deadline = request_deadline
38
+ response_guard, response = perform_request(deadline:)
39
+ response_guard.inspect_body!(response.body)
40
+ build_response(response)
41
+ rescue Faraday::TimeoutError, Timeout::Error => error
42
+ raise RequestTimedOut, error.message
43
+ end
44
+
45
+ private
46
+
47
+ def request_deadline
48
+ monotonic_now + ctx.policy.total_timeout_seconds
49
+ end
50
+
51
+ def perform_request(deadline:)
52
+ response_guard = ResponseGuard.new(policy: ctx.policy)
53
+ response = faraday_request(response_guard, deadline:, streaming_buffer: true)
54
+ response = retry_without_streaming(response_guard, deadline:) if retry_without_streaming?(response)
55
+ [response_guard, response]
56
+ end
57
+
58
+ def build_response(response)
59
+ Response.new(body: response.body, headers: response.headers, url: response_url(response),
60
+ status: response.status)
61
+ end
62
+
63
+ def validate_request!(consume_budget: true)
64
+ ctx.budget.consume! if consume_budget
65
+ ctx.policy.validate_request!(url: ctx.url, origin_url: ctx.origin_url, relation: ctx.relation)
66
+ end
67
+
68
+ def faraday_request(response_guard, deadline:, streaming_buffer:, consume_budget: true)
69
+ validate_request!(consume_budget:)
70
+
71
+ client.get do |req|
72
+ apply_timeouts(req, deadline:)
73
+ buffer = prepare_stream_buffer(req) if streaming_buffer
74
+ req.options.on_data = on_data_callback(response_guard, buffer)
75
+ end
76
+ end
77
+
78
+ def retry_without_streaming(response_guard, deadline:)
79
+ faraday_request(response_guard, deadline:, streaming_buffer: false, consume_budget: false)
80
+ end
81
+
82
+ def client
83
+ @client ||= Faraday.new(url: ctx.url.to_s, headers: ctx.headers) do |faraday|
84
+ faraday.use Faraday::FollowRedirects::Middleware, limit: ctx.policy.max_redirects, callback: redirect_callback
85
+ faraday.request :gzip
86
+ faraday.use StreamingBodyMiddleware
16
87
  faraday.adapter Faraday.default_adapter
17
88
  end
18
- response = request.get
89
+ end
90
+
91
+ def apply_timeouts(request, deadline:)
92
+ remaining_timeout = remaining_timeout_seconds(deadline)
93
+ request.options.timeout = remaining_timeout
94
+ request.options.open_timeout = [ctx.policy.connect_timeout_seconds, remaining_timeout].min
95
+ request.options.read_timeout = [ctx.policy.read_timeout_seconds, remaining_timeout].min
96
+ end
97
+
98
+ def prepare_stream_buffer(request)
99
+ request.options.context ||= {}
100
+ request.options.context[StreamingBodyMiddleware::STREAM_BUFFER_KEY] = +''
101
+ end
102
+
103
+ def on_data_callback(response_guard, buffer)
104
+ proc do |chunk, total_bytes, env|
105
+ response_guard.inspect_chunk!(total_bytes:, headers: env&.response_headers)
106
+ buffer&.<< chunk
107
+ end
108
+ end
109
+
110
+ def remaining_timeout_seconds(deadline)
111
+ remaining = deadline - monotonic_now
112
+ raise RequestTimedOut, 'Request timed out' if remaining <= 0
113
+
114
+ remaining
115
+ end
116
+
117
+ def retry_without_streaming?(response)
118
+ return false if response.body.to_s.empty? == false
119
+ return false unless response_success?(response)
120
+
121
+ final_url = response.env&.url
122
+ return false unless final_url
123
+
124
+ final_url.to_s != ctx.url.to_s
125
+ end
126
+
127
+ def response_success?(response)
128
+ return true if response.status.nil?
129
+
130
+ response.status >= 200 && response.status < 300
131
+ end
132
+
133
+ def response_url(response)
134
+ return ctx.url unless (url = response.env&.url)
135
+
136
+ Html2rss::Url.from_absolute(url.to_s)
137
+ end
138
+
139
+ def redirect_callback
140
+ lambda do |old_env, new_env|
141
+ from_url = normalize_url(old_env[:url])
142
+ to_url = normalize_url(new_env[:url])
143
+ ctx.policy.validate_redirect!(from_url:, to_url:, origin_url: ctx.origin_url, relation: ctx.relation)
144
+ end
145
+ end
146
+
147
+ def normalize_url(url)
148
+ Html2rss::Url.from_absolute(url.to_s)
149
+ end
19
150
 
20
- Response.new(body: response.body, headers: response.headers)
151
+ def monotonic_now
152
+ Process.clock_gettime(Process::CLOCK_MONOTONIC)
21
153
  end
22
154
  end
23
155
  end
@@ -0,0 +1,252 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'ipaddr'
4
+ require 'resolv'
5
+ require 'socket'
6
+
7
+ module Html2rss
8
+ class RequestService
9
+ ##
10
+ # Describes the runtime request envelope for a single feed build.
11
+ class Policy # rubocop:disable Metrics/ClassLength
12
+ MAX_REQUESTS_CEILING = 10
13
+ # Hostnames treated as local/private surfaces.
14
+ LOCAL_HOSTS = %w[localhost localhost.localdomain metadata.google.internal].to_set.freeze
15
+ # IP ranges blocked when private networks are disabled.
16
+ BLOCKED_IP_RANGES = [
17
+ IPAddr.new('0.0.0.0/8'),
18
+ IPAddr.new('10.0.0.0/8'),
19
+ IPAddr.new('127.0.0.0/8'),
20
+ IPAddr.new('169.254.0.0/16'),
21
+ IPAddr.new('172.16.0.0/12'),
22
+ IPAddr.new('192.168.0.0/16'),
23
+ IPAddr.new('224.0.0.0/4'),
24
+ IPAddr.new('::/128'),
25
+ IPAddr.new('::1/128'),
26
+ IPAddr.new('fe80::/10'),
27
+ IPAddr.new('fc00::/7'),
28
+ IPAddr.new('ff00::/8')
29
+ ].freeze
30
+
31
+ # Default policy values used when request controls are not explicitly set.
32
+ DEFAULTS = {
33
+ connect_timeout_seconds: 5,
34
+ read_timeout_seconds: 10,
35
+ total_timeout_seconds: 30,
36
+ max_redirects: 3,
37
+ max_response_bytes: 5_242_880,
38
+ max_decompressed_bytes: 10_485_760,
39
+ max_requests: 1,
40
+ allow_private_networks: false,
41
+ allow_cross_origin_followups: false
42
+ }.freeze
43
+
44
+ ##
45
+ # @param connect_timeout_seconds [Integer] maximum connection setup time
46
+ # @param read_timeout_seconds [Integer] maximum read stall time
47
+ # @param total_timeout_seconds [Integer] maximum total request time
48
+ # @param max_redirects [Integer] maximum redirect count
49
+ # @param max_response_bytes [Integer] maximum streamed response bytes
50
+ # @param max_decompressed_bytes [Integer] maximum final body size
51
+ # @param max_requests [Integer] maximum requests per feed build
52
+ # @param allow_private_networks [Boolean] whether private network targets are allowed
53
+ # @param allow_cross_origin_followups [Boolean] whether follow-up requests may leave the origin host
54
+ # @param resolver [#each_address] DNS resolver used for hostname classification
55
+ def initialize(connect_timeout_seconds: DEFAULTS[:connect_timeout_seconds], # rubocop:disable Metrics/MethodLength, Metrics/ParameterLists
56
+ read_timeout_seconds: DEFAULTS[:read_timeout_seconds],
57
+ total_timeout_seconds: DEFAULTS[:total_timeout_seconds],
58
+ max_redirects: DEFAULTS[:max_redirects],
59
+ max_response_bytes: DEFAULTS[:max_response_bytes],
60
+ max_decompressed_bytes: DEFAULTS[:max_decompressed_bytes],
61
+ max_requests: DEFAULTS[:max_requests],
62
+ allow_private_networks: DEFAULTS[:allow_private_networks],
63
+ allow_cross_origin_followups: DEFAULTS[:allow_cross_origin_followups],
64
+ resolver: Socket)
65
+ @connect_timeout_seconds = validate_positive_integer!(:connect_timeout_seconds, connect_timeout_seconds)
66
+ @read_timeout_seconds = validate_positive_integer!(:read_timeout_seconds, read_timeout_seconds)
67
+ @total_timeout_seconds = validate_positive_integer!(:total_timeout_seconds, total_timeout_seconds)
68
+ @max_redirects = validate_non_negative_integer!(:max_redirects, max_redirects)
69
+ @max_response_bytes = validate_positive_integer!(:max_response_bytes, max_response_bytes)
70
+ @max_decompressed_bytes = validate_positive_integer!(:max_decompressed_bytes, max_decompressed_bytes)
71
+ @max_requests = [validate_positive_integer!(:max_requests, max_requests), MAX_REQUESTS_CEILING].min
72
+ @allow_private_networks = allow_private_networks ? true : false
73
+ @allow_cross_origin_followups = allow_cross_origin_followups ? true : false
74
+ @resolver = resolver
75
+ freeze
76
+ end
77
+
78
+ attr_reader :connect_timeout_seconds,
79
+ :read_timeout_seconds,
80
+ :total_timeout_seconds,
81
+ :max_redirects,
82
+ :max_response_bytes,
83
+ :max_decompressed_bytes,
84
+ :max_requests
85
+
86
+ ##
87
+ # @return [Boolean] whether private network targets may be requested
88
+ def allow_private_networks?
89
+ @allow_private_networks
90
+ end
91
+
92
+ ##
93
+ # @return [Boolean] whether follow-up requests may leave the initial origin
94
+ def allow_cross_origin_followups?
95
+ @allow_cross_origin_followups
96
+ end
97
+
98
+ ##
99
+ # Returns the default request policy.
100
+ #
101
+ # @return [Policy] a default, frozen policy instance
102
+ # rubocop:disable Layout/ClassStructure
103
+ def self.default
104
+ new
105
+ end
106
+ # rubocop:enable Layout/ClassStructure
107
+
108
+ ##
109
+ # Validates whether a request target is permitted for the given context.
110
+ #
111
+ # @param url [Html2rss::Url] destination URL
112
+ # @param origin_url [Html2rss::Url] initial URL of the feed build
113
+ # @param relation [Symbol] logical reason for the request
114
+ # @return [void]
115
+ # @raise [CrossOriginFollowUpDenied] if a follow-up leaves the origin host
116
+ # @raise [PrivateNetworkDenied] if the target resolves to a private address
117
+ def validate_request!(url:, origin_url:, relation:)
118
+ enforce_same_origin!(url, origin_url, relation)
119
+ enforce_public_network!(url)
120
+ end
121
+
122
+ ##
123
+ # Validates a redirect hop before it is followed.
124
+ #
125
+ # @param from_url [Html2rss::Url] URL that produced the redirect
126
+ # @param to_url [Html2rss::Url] redirect destination
127
+ # @param origin_url [Html2rss::Url] initial URL of the feed build
128
+ # @param relation [Symbol] logical reason for the request
129
+ # @return [void]
130
+ # @raise [UnsupportedUrlScheme] if the redirect downgrades from HTTPS to HTTP
131
+ def validate_redirect!(from_url:, to_url:, origin_url:, relation:)
132
+ if from_url.scheme == 'https' && to_url.scheme == 'http'
133
+ raise UnsupportedUrlScheme, 'Redirect downgraded from https to http'
134
+ end
135
+
136
+ validate_request!(url: to_url, origin_url:, relation:)
137
+ end
138
+
139
+ ##
140
+ # Validates the resolved remote IP for a completed request.
141
+ #
142
+ # @param ip [String, nil] remote IP address reported by the client
143
+ # @param url [Html2rss::Url] URL associated with the response
144
+ # @return [void]
145
+ # @raise [PrivateNetworkDenied] if the response came from a blocked address
146
+ def validate_remote_ip!(ip:, url:)
147
+ return if allow_private_networks?
148
+ return if ip.nil? || ip.empty?
149
+
150
+ parsed_ip = parse_ip(ip)
151
+ raise PrivateNetworkDenied, "Remote IP could not be validated for #{url}" unless parsed_ip
152
+ return unless blocked_ip?(parsed_ip)
153
+
154
+ raise PrivateNetworkDenied, "Private network target denied for #{url}"
155
+ end
156
+
157
+ private
158
+
159
+ attr_reader :resolver
160
+
161
+ def validate_positive_integer!(name, value)
162
+ raise ArgumentError, "#{name} must be positive" unless value.is_a?(Integer) && value.positive?
163
+
164
+ value
165
+ end
166
+
167
+ def validate_non_negative_integer!(name, value)
168
+ raise ArgumentError, "#{name} must be non-negative" unless value.is_a?(Integer) && !value.negative?
169
+
170
+ value
171
+ end
172
+
173
+ def enforce_same_origin!(url, origin_url, relation)
174
+ return if relation == :initial || allow_cross_origin_followups?
175
+
176
+ enforce_follow_up_scheme!(url, origin_url)
177
+ return if comparable_origin(url) == comparable_origin(origin_url)
178
+
179
+ raise CrossOriginFollowUpDenied, "Cross-origin follow-up denied for #{url}"
180
+ end
181
+
182
+ def enforce_follow_up_scheme!(url, origin_url)
183
+ return unless origin_url.scheme == 'https' && url.scheme == 'http'
184
+
185
+ raise UnsupportedUrlScheme, "Follow-up downgraded from https to http for #{url}"
186
+ end
187
+
188
+ def comparable_origin(url)
189
+ [url.host, normalized_port(url)]
190
+ end
191
+
192
+ def normalized_port(url)
193
+ return url.port if url.port
194
+
195
+ url.scheme == 'https' ? 443 : 80
196
+ end
197
+
198
+ def enforce_public_network!(url)
199
+ host = url.host
200
+ return if allow_private_networks?
201
+ return unless blocked_host?(host) || resolved_ip_addresses(host).any? { |address| blocked_ip?(address) }
202
+
203
+ raise PrivateNetworkDenied, "Private network target denied for #{url}"
204
+ end
205
+
206
+ def blocked_host?(host)
207
+ LOCAL_HOSTS.include?(host.to_s.downcase)
208
+ end
209
+
210
+ def resolved_ip_addresses(host)
211
+ literal = parse_ip(host)
212
+ return [literal] if literal
213
+
214
+ if resolver.respond_to?(:each_address)
215
+ addresses_from_each_address(host)
216
+ else
217
+ addresses_from_getaddrinfo(host)
218
+ end
219
+ rescue Resolv::ResolvError, SocketError, SystemCallError
220
+ []
221
+ end
222
+
223
+ def addresses_from_each_address(host)
224
+ [].tap do |addresses|
225
+ resolver.each_address(host) do |address|
226
+ parsed = parse_ip(address)
227
+ addresses << parsed if parsed
228
+ end
229
+ end
230
+ end
231
+
232
+ def addresses_from_getaddrinfo(host)
233
+ resolver.getaddrinfo(host, nil).filter_map do |entry|
234
+ parse_ip(entry[3])
235
+ end
236
+ end
237
+
238
+ def parse_ip(value)
239
+ IPAddr.new(value)
240
+ rescue IPAddr::AddressFamilyError, IPAddr::InvalidAddressError
241
+ nil
242
+ end
243
+
244
+ def blocked_ip?(address)
245
+ BLOCKED_IP_RANGES.any? { |range| range.include?(address) }
246
+ end
247
+ end
248
+
249
+ # Shared immutable policy instance used for default request execution.
250
+ Policy::DEFAULT_POLICY = Policy.new
251
+ end
252
+ end
@@ -4,7 +4,13 @@ module Html2rss
4
4
  class RequestService
5
5
  ##
6
6
  # Commands the Puppeteer Browser to the website and builds the Response.
7
- class PuppetCommander
7
+ class PuppetCommander # rubocop:disable Metrics/ClassLength
8
+ BROWSER_UNSAFE_HEADERS = %w[
9
+ host connection content-length transfer-encoding
10
+ sec-fetch-dest sec-fetch-mode sec-fetch-site sec-fetch-user
11
+ upgrade-insecure-requests
12
+ ].to_set.freeze
13
+
8
14
  # @param ctx [Context]
9
15
  # @param browser [Puppeteer::Browser]
10
16
  # @param skip_request_resources [Set<String>] the resource types not to request
@@ -19,13 +25,18 @@ module Html2rss
19
25
  @referer = referer
20
26
  end
21
27
 
22
- # @return [Response]
28
+ ##
29
+ # Visits the request URL and normalizes the page into a response object.
30
+ #
31
+ # @return [Response] rendered page response
23
32
  def call
24
33
  page = new_page
25
-
26
- response = navigate_to_destination(page, ctx.url)
27
-
28
- Response.new(body: body(page), headers: response.headers)
34
+ navigation_response = navigate_to_destination(page, ctx.url)
35
+ perform_preload(page)
36
+ raise_navigation_error_if_any
37
+ final_navigation_response = latest_navigation_response || navigation_response
38
+ validate_navigation_response!(final_navigation_response)
39
+ build_response(page, final_navigation_response)
29
40
  ensure
30
41
  page&.close
31
42
  end
@@ -35,27 +46,215 @@ module Html2rss
35
46
  # @see https://yusukeiwaki.github.io/puppeteer-ruby-docs/Puppeteer/Page.html
36
47
  def new_page
37
48
  page = browser.new_page
38
- page.extra_http_headers = ctx.headers
49
+ @main_frame = page.main_frame if page.respond_to?(:main_frame)
50
+ configure_page(page)
51
+ configure_navigation_guards(page)
52
+ page
53
+ end
39
54
 
40
- return page if skip_request_resources.empty?
55
+ ##
56
+ # @param page [Puppeteer::Page]
57
+ # @return [void]
58
+ def configure_page(page)
59
+ page.extra_http_headers = browser_headers
60
+ page.default_navigation_timeout = navigation_timeout_ms
61
+ page.default_timeout = navigation_timeout_ms
62
+ end
41
63
 
64
+ ##
65
+ # @param page [Puppeteer::Page]
66
+ # @return [void]
67
+ def configure_navigation_guards(page)
42
68
  page.request_interception = true
43
69
  page.on('request') do |request|
44
- skip_request_resources.member?(request.resource_type) ? request.abort : request.continue
70
+ handle_request(request)
45
71
  end
46
-
47
- page
72
+ page.on('response') { |response| handle_response(response) }
48
73
  end
49
74
 
75
+ ##
76
+ # @param page [Puppeteer::Page] browser page
77
+ # @param url [Html2rss::Url] target URL
78
+ # @return [Puppeteer::HTTPResponse, nil] the navigation response if one was produced
50
79
  def navigate_to_destination(page, url)
51
- page.goto(url, wait_until: 'networkidle0', referer:)
80
+ @navigation_error = nil
81
+ @latest_navigation_response = nil
82
+ page.goto(url, wait_until: 'networkidle0', referer:, timeout: navigation_timeout_ms).tap do
83
+ raise_navigation_error_if_any
84
+ end
85
+ rescue StandardError
86
+ raise_navigation_error_if_any
87
+
88
+ raise
52
89
  end
53
90
 
91
+ ##
92
+ # @param page [Puppeteer::Page] browser page
93
+ # @return [String] rendered HTML content
54
94
  def body(page) = page.content
55
95
 
56
96
  private
57
97
 
58
- attr_reader :ctx, :browser, :skip_request_resources, :referer
98
+ attr_reader :ctx, :browser, :skip_request_resources, :referer, :latest_navigation_response, :main_frame
99
+
100
+ def raise_navigation_error_if_any
101
+ raise @navigation_error if @navigation_error
102
+ end
103
+
104
+ def navigation_timeout_ms
105
+ ctx.policy.total_timeout_seconds * 1000
106
+ end
107
+
108
+ def browser_headers
109
+ ctx.headers.reject { |key, _| BROWSER_UNSAFE_HEADERS.include?(key.to_s.downcase) }
110
+ end
111
+
112
+ def handle_request(request)
113
+ validate_request!(request)
114
+
115
+ skip_request_resources.member?(request.resource_type) ? request.abort : request.continue
116
+ rescue Html2rss::Error => error
117
+ store_navigation_error(error, navigation_request: request.navigation_request?)
118
+ request.abort
119
+ end
120
+
121
+ def handle_response(response)
122
+ @latest_navigation_response = response if main_frame_navigation_response?(response)
123
+ validate_response!(response)
124
+ rescue Html2rss::Error => error
125
+ store_navigation_error(error, navigation_request: response.request.navigation_request?)
126
+ end
127
+
128
+ def validate_request!(request)
129
+ validate_navigation_redirect_chain!(request)
130
+ validate_navigation_target!(request)
131
+ end
132
+
133
+ def main_frame_navigation_response?(response)
134
+ request = response.request
135
+ return false unless request.navigation_request?
136
+ return true unless request.respond_to?(:frame)
137
+
138
+ frame = request.frame
139
+ return true if frame.nil?
140
+ return frame == main_frame unless main_frame.nil?
141
+ return true unless frame.respond_to?(:parent_frame)
142
+
143
+ frame.parent_frame.nil?
144
+ end
145
+
146
+ def build_response(page, navigation_response)
147
+ page_body = body(page)
148
+ ResponseGuard.new(policy: ctx.policy).inspect_body!(page_body)
149
+
150
+ Response.new(
151
+ body: page_body,
152
+ headers: navigation_response&.headers || {},
153
+ url: response_url(navigation_response, ctx.url),
154
+ status: navigation_response&.status
155
+ )
156
+ end
157
+
158
+ def validate_navigation_response!(navigation_response)
159
+ final_url = response_url(navigation_response, ctx.url)
160
+ ctx.policy.validate_remote_ip!(ip: remote_ip(navigation_response), url: final_url)
161
+ end
162
+
163
+ def validate_response!(response)
164
+ validate_navigation_response!(response)
165
+ end
166
+
167
+ def response_url(navigation_response, fallback_url)
168
+ raw_url = navigation_response&.url || fallback_url.to_s
169
+ Html2rss::Url.from_absolute(raw_url)
170
+ end
171
+
172
+ def remote_ip(navigation_response)
173
+ navigation_response.remote_address&.ip
174
+ end
175
+
176
+ def request_chain(request)
177
+ (request.redirect_chain + [request]).map { |entry| request_url(entry) }
178
+ end
179
+
180
+ def request_url(request)
181
+ Html2rss::Url.from_absolute(request.url)
182
+ end
183
+
184
+ def validate_navigation_redirect_chain!(request)
185
+ request_chain(request).each_cons(2) do |from_url, to_url|
186
+ ctx.policy.validate_redirect!(from_url:, to_url:, origin_url: ctx.origin_url, relation: ctx.relation)
187
+ end
188
+ end
189
+
190
+ def validate_navigation_target!(request)
191
+ ctx.policy.validate_request!(url: request_url(request), origin_url: ctx.origin_url, relation: ctx.relation)
192
+ end
193
+
194
+ def store_navigation_error(error, navigation_request:)
195
+ return unless navigation_request
196
+
197
+ @navigation_error = error if @navigation_error.nil?
198
+ end
199
+
200
+ def perform_preload(page)
201
+ preload_config = ctx.browserless_preload
202
+ return unless preload_config
203
+
204
+ wait_after(page, preload_config[:wait_after_ms])
205
+ click_selectors(page, preload_config[:click_selectors]) if preload_config[:click_selectors]
206
+ scroll_down(page, preload_config[:scroll_down]) if preload_config[:scroll_down]
207
+ wait_after(page, preload_config[:wait_after_ms])
208
+ end
209
+
210
+ def wait_after(page, timeout_ms)
211
+ return unless timeout_ms
212
+
213
+ ctx.budget.consume!
214
+ page.wait_for_timeout(timeout_ms)
215
+ end
216
+
217
+ def click_selectors(page, selectors)
218
+ selectors.each { |selector_config| click_selector(page, selector_config) }
219
+ end
220
+
221
+ def scroll_down(page, config)
222
+ iterations = config.fetch(:iterations, 1)
223
+ wait_after_ms = config[:wait_after_ms]
224
+ previous_height = nil
225
+
226
+ iterations.times do
227
+ updated_height = perform_scroll_iteration(page, wait_after_ms, previous_height)
228
+ break unless updated_height
229
+
230
+ previous_height = updated_height
231
+ end
232
+ end
233
+
234
+ def click_selector(page, config)
235
+ selector = config.fetch(:selector)
236
+ max_clicks = config.fetch(:max_clicks, 1)
237
+ wait_after_ms = config[:wait_after_ms]
238
+
239
+ max_clicks.times do
240
+ break unless (element = page.query_selector(selector))
241
+
242
+ ctx.budget.consume!
243
+ element.click
244
+ wait_after(page, wait_after_ms)
245
+ end
246
+ end
247
+
248
+ def perform_scroll_iteration(page, wait_after_ms, previous_height)
249
+ ctx.budget.consume!
250
+ page.evaluate('() => window.scrollTo(0, document.body.scrollHeight)')
251
+ wait_after(page, wait_after_ms)
252
+
253
+ current_height = page.evaluate('() => document.body.scrollHeight')
254
+ return if previous_height && current_height <= previous_height
255
+
256
+ current_height
257
+ end
59
258
  end
60
259
  end
61
260
  end