html2rss 0.14.0 → 0.16.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33)
  1. checksums.yaml +4 -4
  2. data/README.md +113 -44
  3. data/html2rss.gemspec +3 -2
  4. data/lib/html2rss/auto_source/article.rb +37 -5
  5. data/lib/html2rss/auto_source/channel.rb +21 -28
  6. data/lib/html2rss/auto_source/cleanup.rb +0 -16
  7. data/lib/html2rss/auto_source/rss_builder.rb +1 -1
  8. data/lib/html2rss/auto_source/scraper/html.rb +96 -0
  9. data/lib/html2rss/auto_source/scraper/schema/item_list.rb +34 -0
  10. data/lib/html2rss/auto_source/scraper/schema/list_item.rb +25 -0
  11. data/lib/html2rss/auto_source/scraper/schema/thing.rb +104 -0
  12. data/lib/html2rss/auto_source/scraper/schema.rb +22 -33
  13. data/lib/html2rss/auto_source/scraper/semantic_html/extractor.rb +51 -38
  14. data/lib/html2rss/auto_source/scraper.rb +1 -0
  15. data/lib/html2rss/auto_source.rb +0 -7
  16. data/lib/html2rss/cli.rb +11 -4
  17. data/lib/html2rss/config/channel.rb +7 -1
  18. data/lib/html2rss/config/selectors.rb +2 -1
  19. data/lib/html2rss/config.rb +1 -0
  20. data/lib/html2rss/item.rb +7 -2
  21. data/lib/html2rss/request_service/browserless_strategy.rb +53 -0
  22. data/lib/html2rss/request_service/context.rb +46 -0
  23. data/lib/html2rss/request_service/faraday_strategy.rb +24 -0
  24. data/lib/html2rss/request_service/puppet_commander.rb +61 -0
  25. data/lib/html2rss/request_service/response.rb +27 -0
  26. data/lib/html2rss/request_service/strategy.rb +28 -0
  27. data/lib/html2rss/request_service.rb +97 -0
  28. data/lib/html2rss/rss_builder/stylesheet.rb +7 -0
  29. data/lib/html2rss/utils.rb +23 -26
  30. data/lib/html2rss/version.rb +1 -1
  31. data/lib/html2rss.rb +7 -6
  32. metadata +35 -11
  33. data/lib/html2rss/auto_source/scraper/schema/base.rb +0 -61
@@ -0,0 +1,104 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'date'
4
+
5
+ module Html2rss
6
+ class AutoSource
7
+ module Scraper
8
+ class Schema
9
+ ##
10
+ # A Thing is kind of the 'base class' for Schema.org schema_objects.
11
+ #
12
+ # @see https://schema.org/Thing
13
+ class Thing
14
+ SUPPORTED_TYPES = %w[
15
+ AdvertiserContentArticle
16
+ AnalysisNewsArticle
17
+ APIReference
18
+ Article
19
+ AskPublicNewsArticle
20
+ BackgroundNewsArticle
21
+ BlogPosting
22
+ DiscussionForumPosting
23
+ LiveBlogPosting
24
+ NewsArticle
25
+ OpinionNewsArticle
26
+ Report
27
+ ReportageNewsArticle
28
+ ReviewNewsArticle
29
+ SatiricalArticle
30
+ ScholarlyArticle
31
+ SocialMediaPosting
32
+ TechArticle
33
+ ].to_set.freeze
34
+
35
+ DEFAULT_ATTRIBUTES = %i[id title description url image published_at].freeze
36
+
37
+ def initialize(schema_object, url:)
38
+ @schema_object = schema_object
39
+ @url = url
40
+ end
41
+
42
+ # @return [Hash] the scraped article hash with DEFAULT_ATTRIBUTES
43
+ def call
44
+ DEFAULT_ATTRIBUTES.to_h do |attribute|
45
+ [attribute, public_send(attribute)]
46
+ end
47
+ end
48
+
49
+ def id
50
+ return @id if defined?(@id)
51
+
52
+ id = (schema_object[:@id] || url&.path).to_s
53
+
54
+ return if id.empty?
55
+
56
+ @id = id
57
+ end
58
+
59
+ def title = schema_object[:title]
60
+
61
+ def description
62
+ schema_object.values_at(:description, :schema_object_body, :abstract)
63
+ .max_by { |string| string.to_s.size }
64
+ end
65
+
66
+ # @return [Addressable::URI, nil] the URL of the schema object
67
+ def url
68
+ url = schema_object[:url]
69
+ if url.to_s.empty?
70
+ Log.debug("Schema#Thing.url: no url in schema_object: #{schema_object.inspect}")
71
+ return
72
+ end
73
+
74
+ Utils.build_absolute_url_from_relative(url, @url)
75
+ end
76
+
77
+ def image
78
+ if (image_url = image_urls.first)
79
+ Utils.build_absolute_url_from_relative(image_url, @url)
80
+ end
81
+ end
82
+
83
+ def published_at = schema_object[:datePublished]
84
+
85
+ private
86
+
87
+ attr_reader :schema_object
88
+
89
+ def image_urls
90
+ schema_object.values_at(:image, :thumbnailUrl).filter_map do |object|
91
+ next unless object
92
+
93
+ if object.is_a?(String)
94
+ object
95
+ elsif object.is_a?(Hash) && object[:@type] == 'ImageObject'
96
+ object[:url] || object[:contentUrl]
97
+ end
98
+ end
99
+ end
100
+ end
101
+ end
102
+ end
103
+ end
104
+ end
@@ -8,52 +8,33 @@ module Html2rss
8
8
  class AutoSource
9
9
  module Scraper
10
10
  ##
11
- # Scraps articles from Schema.org objects, by looking for the objects in:
11
+ # Scrapes articles from Schema.org objects, by looking for the objects in:
12
12
 
13
- # 1. <script type="application/ld+json"> "schema" tag.
14
- # 2. tbd
13
+ # <script type="application/ld+json"> "schema" tags.
15
14
  #
16
15
  # See:
17
- # 1. https://schema.org/NewsArticle
16
+ # 1. https://schema.org/docs/full.html
18
17
  # 2. https://developers.google.com/search/docs/appearance/structured-data/article#microdata
19
18
  class Schema
20
19
  include Enumerable
21
20
 
22
21
  TAG_SELECTOR = 'script[type="application/ld+json"]'
23
- SCHEMA_OBJECT_TYPES = %w[
24
- AdvertiserContentArticle
25
- AnalysisNewsArticle
26
- APIReference
27
- Article
28
- AskPublicNewsArticle
29
- BackgroundNewsArticle
30
- BlogPosting
31
- DiscussionForumPosting
32
- LiveBlogPosting
33
- NewsArticle
34
- OpinionNewsArticle
35
- Report
36
- ReportageNewsArticle
37
- ReviewNewsArticle
38
- SatiricalArticle
39
- ScholarlyArticle
40
- SocialMediaPosting
41
- TechArticle
42
- ].to_set.freeze
43
22
 
44
23
  class << self
45
24
  def articles?(parsed_body)
46
25
  parsed_body.css(TAG_SELECTOR).any? do |script|
47
- SCHEMA_OBJECT_TYPES.any? { |type| script.text.match?(/"@type"\s*:\s*"#{Regexp.escape(type)}"/) }
26
+ (Thing::SUPPORTED_TYPES | ItemList::SUPPORTED_TYPES).any? do |type|
27
+ script.text.match?(/"@type"\s*:\s*"#{Regexp.escape(type)}"/)
28
+ end
48
29
  end
49
30
  end
50
31
 
51
32
  ##
52
33
  # Returns a flat array
53
34
  # of all supported schema objects
54
- # by recursively traversing the `from` object.
35
+ # by recursively traversing the given `object`.
55
36
  #
56
- # @param object [Hash, Array]
37
+ # @param object [Hash, Array, Nokogiri::XML::Element]
57
38
  # @return [Array<Hash>] the schema_objects, or an empty array
58
39
  # :reek:DuplicateMethodCall
59
40
  def from(object)
@@ -74,12 +55,16 @@ module Html2rss
74
55
  end
75
56
 
76
57
  ##
77
- # @return [Scraper::Schema::Base, Scraper::Schema::NewsArticle, nil]
58
+ # @return [Scraper::Schema::Thing, Scraper::Schema::ItemList, nil] a class responding to `#call`
78
59
  def scraper_for_schema_object(schema_object)
79
- if SCHEMA_OBJECT_TYPES.member?(schema_object[:@type])
80
- Base
60
+ type = schema_object[:@type]
61
+
62
+ if Thing::SUPPORTED_TYPES.member?(type)
63
+ Thing
64
+ elsif ItemList::SUPPORTED_TYPES.member?(type)
65
+ ItemList
81
66
  else
82
- Log.warn("Schema#scraper_for_schema_object: Unsupported schema object @type: #{schema_object[:@type]}")
67
+ Log.warn("Schema#scraper_for_schema_object: Unsupported schema object @type: #{type}")
83
68
  nil
84
69
  end
85
70
  end
@@ -107,9 +92,13 @@ module Html2rss
107
92
 
108
93
  schema_objects.filter_map do |schema_object|
109
94
  next unless (klass = self.class.scraper_for_schema_object(schema_object))
110
- next unless (article_hash = klass.new(schema_object, url:).call)
95
+ next unless (results = klass.new(schema_object, url:).call)
111
96
 
112
- yield article_hash
97
+ if results.is_a?(Array)
98
+ results.each { |result| yield(result) } # rubocop:disable Style/ExplicitBlockArgument
99
+ else
100
+ yield(results)
101
+ end
113
102
  end
114
103
  end
115
104
 
@@ -35,13 +35,13 @@ module Html2rss
35
35
  def initialize(article_tag, url:)
36
36
  @article_tag = article_tag
37
37
  @url = url
38
- @heading = find_heading
39
- @extract_url = find_url
40
38
  end
41
39
 
42
40
  # @return [Hash, nil] The scraped article or nil.
43
41
  def call
44
- return unless heading
42
+ @heading = find_heading || closest_anchor || return
43
+
44
+ @extract_url = find_url
45
45
 
46
46
  {
47
47
  title: extract_title,
@@ -57,35 +57,45 @@ module Html2rss
57
57
 
58
58
  attr_reader :article_tag, :url, :heading, :extract_url
59
59
 
60
- def visible_text_from_tag(tag, separator: ' ') = self.class.visible_text_from_tag(tag, separator:)
60
+ ##
61
+ # Find the heading of the article.
62
+ # @return [Nokogiri::XML::Node, nil]
63
+ def find_heading
64
+ heading_tags = article_tag.css(HEADING_TAGS.join(',')).group_by(&:name)
61
65
 
62
- # @see https://developer.mozilla.org/en-US/docs/Web/API/HTMLTimeElement/dateTime
63
- def extract_published_at
64
- times = article_tag.css('time[datetime]')
65
- .filter_map do |tag|
66
- DateTime.parse(tag['datetime'])
67
- rescue ArgumentError, TypeError
68
- nil
69
- end
66
+ return if heading_tags.empty?
70
67
 
71
- times.min
68
+ smallest_heading = heading_tags.keys.min
69
+ heading_tags[smallest_heading]&.max_by { |tag| visible_text_from_tag(tag)&.size.to_i }
72
70
  end
73
71
 
74
- def find_heading
75
- heading_tags = article_tag.css(HEADING_TAGS.join(',')).group_by(&:name)
76
- smallest_heading = heading_tags.keys.min
77
- heading_tags[smallest_heading]&.max_by { |tag| visible_text_from_tag(tag)&.size }
72
+ def visible_text_from_tag(tag, separator: ' ') = self.class.visible_text_from_tag(tag, separator:)
73
+
74
+ def closest_anchor
75
+ SemanticHtml.find_closest_selector(heading || article_tag,
76
+ selector: 'a[href]:not([href=""])')
77
+ end
78
+
79
+ def find_url
80
+ href = closest_anchor&.[]('href')
81
+
82
+ return if (parts = href.to_s.split('#')).empty?
83
+
84
+ Utils.build_absolute_url_from_relative(parts.first.strip, url)
78
85
  end
79
86
 
80
87
  def extract_title
81
- @extract_title ||= if heading.children.empty? && heading.text
82
- visible_text_from_tag(heading)
83
- else
84
- visible_text_from_tag(
85
- article_tag.css(HEADING_TAGS.join(','))
86
- .max_by { |tag| tag.text.size }
87
- )
88
- end
88
+ if heading && (heading.children.empty? || heading.text)
89
+ visible_text_from_tag(heading)
90
+ else
91
+ visible_text_from_tag(article_tag.css(HEADING_TAGS.join(','))
92
+ .max_by { |tag| tag.text.size })
93
+
94
+ end
95
+ end
96
+
97
+ def extract_image
98
+ Image.call(article_tag, url:)
89
99
  end
90
100
 
91
101
  def extract_description
@@ -95,26 +105,29 @@ module Html2rss
95
105
  description = visible_text_from_tag(article_tag)
96
106
  return nil unless description
97
107
 
98
- title_text = extract_title
99
- description.gsub!(title_text, '') if title_text
100
108
  description.strip!
101
109
  description.empty? ? nil : description
102
110
  end
103
111
 
104
- def find_url
105
- closest_anchor = SemanticHtml.find_closest_selector(heading || article_tag,
106
- selector: 'a[href]:not([href=""])')
107
- href = closest_anchor&.[]('href')&.split('#')&.first&.strip
108
- Utils.build_absolute_url_from_relative(href, url) unless href.to_s.empty?
112
+ def generate_id
113
+ [
114
+ article_tag['id'],
115
+ article_tag.at_css('[id]')&.attr('id'),
116
+ extract_url&.path,
117
+ extract_url&.query
118
+ ].compact.reject(&:empty?).first
109
119
  end
110
120
 
111
- def extract_image
112
- Image.call(article_tag, url:)
113
- end
121
+ # @see https://developer.mozilla.org/en-US/docs/Web/API/HTMLTimeElement/dateTime
122
+ def extract_published_at
123
+ times = article_tag.css('time[datetime]')
124
+ .filter_map do |tag|
125
+ DateTime.parse(tag['datetime'])
126
+ rescue ArgumentError, TypeError
127
+ nil
128
+ end
114
129
 
115
- def generate_id
116
- [article_tag['id'], article_tag.at_css('[id]')&.attr('id'),
117
- extract_url&.path].compact.reject(&:empty?).first
130
+ times.min
118
131
  end
119
132
  end
120
133
  end
@@ -10,6 +10,7 @@ module Html2rss
10
10
  #
11
11
  module Scraper
12
12
  SCRAPERS = [
13
+ Html,
13
14
  Schema,
14
15
  SemanticHtml
15
16
  ].freeze
@@ -11,20 +11,13 @@ module Html2rss
11
11
  # It uses a set of ArticleExtractors to extract articles, utilizing popular ways of
12
12
  # marking articles, e.g. schema, microdata, open graph, etc.
13
13
  class AutoSource
14
- class UnsupportedUrlScheme < Html2rss::Error; end
15
14
  class NoArticlesFound < Html2rss::Error; end
16
15
 
17
- SUPPORTED_URL_SCHEMES = %w[http https].to_set.freeze
18
-
19
16
  ##
20
17
  # @param url [Addressable::URI] The URL to extract articles from.
21
18
  # @param body [String] The body of the response.
22
19
  # @param headers [Hash] The headers of the response.
23
20
  def initialize(url, body:, headers: {})
24
- raise ArgumentError, 'URL must be a Addressable::URI' unless url.is_a?(Addressable::URI)
25
- raise ArgumentError, 'URL must be absolute' unless url.absolute?
26
- raise UnsupportedUrlScheme, "#{url.scheme} not supported" unless SUPPORTED_URL_SCHEMES.include?(url.scheme)
27
-
28
21
  @url = url
29
22
  @body = body
30
23
  @headers = headers
data/lib/html2rss/cli.rb CHANGED
@@ -2,7 +2,6 @@
2
2
 
3
3
  require_relative '../html2rss'
4
4
  require 'thor'
5
- require 'addressable'
6
5
 
7
6
  ##
8
7
  # The Html2rss namespace / command line interface.
@@ -26,14 +25,22 @@ module Html2rss
26
25
  def feed(yaml_file, *options)
27
26
  raise "File '#{yaml_file}' does not exist" unless File.exist?(yaml_file)
28
27
 
29
- feed_name = options.shift
28
+ feed_name = options.shift unless options.first&.include?('=')
30
29
  params = options.to_h { |opt| opt.split('=', 2) }
30
+
31
31
  puts Html2rss.feed_from_yaml_config(yaml_file, feed_name, params:)
32
32
  end
33
33
 
34
- desc 'auto URL', 'automatically sources an RSS feed from the URL'
34
+ desc 'auto URL', 'Automatically sources an RSS feed from the URL'
35
+ method_option :strategy,
36
+ type: :string,
37
+ desc: 'The strategy to request the URL',
38
+ enum: RequestService.strategy_names,
39
+ default: RequestService.default_strategy_name
35
40
  def auto(url)
36
- puts Html2rss.auto_source(url)
41
+ strategy = options.fetch(:strategy) { RequestService.default_strategy_name }.to_sym
42
+
43
+ puts Html2rss.auto_source(url, strategy:)
37
44
  end
38
45
  end
39
46
  end
@@ -55,7 +55,7 @@ module Html2rss
55
55
  ##
56
56
  # @return [String]
57
57
  def title
58
- config.fetch(:title) { Utils.titleized_url(url) }
58
+ config.fetch(:title) { Utils.titleized_channel_url(url) }
59
59
  end
60
60
 
61
61
  ##
@@ -88,6 +88,12 @@ module Html2rss
88
88
  config.fetch(:json, false)
89
89
  end
90
90
 
91
+ ##
92
+ # @return [Symbol]
93
+ def strategy
94
+ config.fetch(:strategy) { RequestService.default_strategy_name }.to_sym
95
+ end
96
+
91
97
  private
92
98
 
93
99
  # @return [Hash<Symbol, Object>]
@@ -8,7 +8,8 @@ module Html2rss
8
8
  ITEMS_SELECTOR_NAME = :items
9
9
 
10
10
  # Struct to represent a selector with associated attributes for extraction and processing.
11
- Selector = Struct.new(:selector, :attribute, :extractor, :post_process, :order, :static, keyword_init: true)
11
+ Selector = Struct.new(:selector, :attribute, :extractor, :post_process, :order, :static, :content_type,
12
+ keyword_init: true)
12
13
 
13
14
  # raised when an invalid selector name is used
14
15
  class InvalidSelectorName < Html2rss::Error; end
@@ -27,6 +27,7 @@ module Html2rss
27
27
  def_delegator :@channel, :url, :link
28
28
  def_delegator :@channel, :time_zone
29
29
  def_delegator :@channel, :json?
30
+ def_delegator :@channel, :strategy
30
31
 
31
32
  def_delegator :@selectors, :item_selector_names
32
33
  def_delegator :@selectors, :selector?
data/lib/html2rss/item.rb CHANGED
@@ -23,7 +23,9 @@ module Html2rss
23
23
  # @param config [Html2rss::Config] Configuration object.
24
24
  # @return [Array<Html2rss::Item>] list of items fetched.
25
25
  def self.from_url(url, config)
26
- body = Utils.request_url(url, headers: config.headers).body
26
+ ctx = RequestService::Context.new(url:, headers: config.headers)
27
+
28
+ body = RequestService.execute(ctx, strategy: config.strategy).body
27
29
  body = ObjectToXmlConverter.new(JSON.parse(body)).call if config.json?
28
30
 
29
31
  Nokogiri.HTML(body)
@@ -136,8 +138,11 @@ module Html2rss
136
138
 
137
139
  raise 'An item.enclosure requires an absolute URL' unless url&.absolute?
138
140
 
141
+ type = config.selector_attributes_with_channel(:enclosure)[:content_type] ||
142
+ Html2rss::Utils.guess_content_type_from_url(url)
143
+
139
144
  Enclosure.new(
140
- type: Html2rss::Utils.guess_content_type_from_url(url),
145
+ type:,
141
146
  bits_length: 0,
142
147
  url: url.to_s
143
148
  )
@@ -0,0 +1,53 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'puppeteer'
4
+
5
+ module Html2rss
6
+ class RequestService
7
+ ##
8
+ # Browserless.io strategy to request websites.
9
+ #
10
+ # Provide the WebSocket URL and your API token via environment variables:
11
+ # - BROWSERLESS_IO_WEBSOCKET_URL
12
+ # - BROWSERLESS_IO_API_TOKEN
13
+ #
14
+ # To use this strategy, you need to have a Browserless.io account or run a
15
+ # local Browserless.io instance.
16
+ #
17
+ # @see https://www.browserless.io/
18
+ #
19
+ # To run a local Browserless.io instance, you can use the following Docker command:
20
+ #
21
+ # ```sh
22
+ # docker run \
23
+ # --rm \
24
+ # -p 3000:3000 \
25
+ # -e "CONCURRENT=10" \
26
+ # -e "TOKEN=6R0W53R135510" \
27
+ # ghcr.io/browserless/chromium
28
+ # ```
29
+ #
30
+ # When running locally, you can skip setting the environment variables, as above commands
31
+ # are aligned with the default values.
32
+ # @see https://github.com/browserless/browserless/pkgs/container/chromium
33
+ class BrowserlessStrategy < Strategy
34
+ # return [Response]
35
+ def execute
36
+ Puppeteer.connect(browser_ws_endpoint:) do |browser|
37
+ PuppetCommander.new(ctx, browser).call
38
+ ensure
39
+ browser.disconnect
40
+ end
41
+ end
42
+
43
+ def browser_ws_endpoint
44
+ @browser_ws_endpoint ||= begin
45
+ api_token = ENV.fetch('BROWSERLESS_IO_API_TOKEN', '6R0W53R135510')
46
+ ws_url = ENV.fetch('BROWSERLESS_IO_WEBSOCKET_URL', 'ws://127.0.0.1:3000')
47
+
48
+ "#{ws_url}?token=#{api_token}"
49
+ end
50
+ end
51
+ end
52
+ end
53
+ end
@@ -0,0 +1,46 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'addressable/uri'
4
+
5
+ module Html2rss
6
+ class RequestService
7
+ ##
8
+ # Holds information needed to send requests to websites.
9
+ # To be passed down to the RequestService's strategies.
10
+ class Context
11
+ SUPPORTED_URL_SCHEMES = %w[http https].to_set.freeze
12
+
13
+ ##
14
+ # @param url [String, Addressable::URI] the URL to request
15
+ # @param headers [Hash] HTTP request headers
16
+ def initialize(url:, headers: {})
17
+ @url = Addressable::URI.parse(url)
18
+ assert_valid_url!
19
+
20
+ @headers = headers
21
+ end
22
+
23
+ # @return [Addressable::URI] the parsed URL
24
+ attr_reader :url
25
+
26
+ # @return [Hash] the HTTP request headers
27
+ attr_reader :headers
28
+
29
+ private
30
+
31
+ ##
32
+ # Validates the URL.
33
+ # @raise [InvalidUrl] if the URL is not valid
34
+ # @raise [UnsupportedUrlScheme] if the URL scheme is not supported
35
+ def assert_valid_url!
36
+ raise InvalidUrl, 'URL must be absolute' unless url.absolute?
37
+ raise InvalidUrl, 'URL must not contain an @ character' if url.to_s.include?('@')
38
+
39
+ return if SUPPORTED_URL_SCHEMES.include?(url.scheme)
40
+
41
+ raise UnsupportedUrlScheme,
42
+ "URL scheme '#{url.scheme}' is not supported"
43
+ end
44
+ end
45
+ end
46
+ end
@@ -0,0 +1,24 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'faraday'
4
+ require 'faraday/follow_redirects'
5
+
6
+ module Html2rss
7
+ class RequestService
8
+ ##
9
+ # Strategy to use Faraday for the request.
10
+ # @see https://rubygems.org/gems/faraday
11
+ class FaradayStrategy < Strategy
12
+ # return [Response]
13
+ def execute
14
+ request = Faraday.new(url: ctx.url, headers: ctx.headers) do |faraday|
15
+ faraday.use Faraday::FollowRedirects::Middleware
16
+ faraday.adapter Faraday.default_adapter
17
+ end
18
+ response = request.get
19
+
20
+ Response.new(body: response.body, headers: response.headers)
21
+ end
22
+ end
23
+ end
24
+ end
@@ -0,0 +1,61 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Html2rss
4
+ class RequestService
5
+ ##
6
+ # Commands the Puppeteer Browser to the website and builds the Response.
7
+ class PuppetCommander
8
+ # @param ctx [Context]
9
+ # @param browser [Puppeteer::Browser]
10
+ # @param skip_request_resources [Set<String>] the resource types not to request
11
+ # @param referer [String] the referer to use for the request
12
+ def initialize(ctx,
13
+ browser,
14
+ skip_request_resources: %w[stylesheet image media font].to_set,
15
+ referer: [ctx.url.scheme, ctx.url.host].join('://'))
16
+ @ctx = ctx
17
+ @browser = browser
18
+ @skip_request_resources = skip_request_resources
19
+ @referer = referer
20
+ end
21
+
22
+ # @return [Response]
23
+ def call
24
+ page = new_page
25
+
26
+ response = navigate_to_destination(page, ctx.url)
27
+
28
+ Response.new(body: body(page), headers: response.headers)
29
+ ensure
30
+ page&.close
31
+ end
32
+
33
+ ##
34
+ # @return [Puppeteer::Page]
35
+ # @see https://yusukeiwaki.github.io/puppeteer-ruby-docs/Puppeteer/Page.html
36
+ def new_page
37
+ page = browser.new_page
38
+ page.extra_http_headers = ctx.headers
39
+
40
+ return page if skip_request_resources.empty?
41
+
42
+ page.request_interception = true
43
+ page.on('request') do |request|
44
+ skip_request_resources.member?(request.resource_type) ? request.abort : request.continue
45
+ end
46
+
47
+ page
48
+ end
49
+
50
+ def navigate_to_destination(page, url)
51
+ page.goto(url, wait_until: 'networkidle0', referer:)
52
+ end
53
+
54
+ def body(page) = page.content
55
+
56
+ private
57
+
58
+ attr_reader :ctx, :browser, :skip_request_resources, :referer
59
+ end
60
+ end
61
+ end
@@ -0,0 +1,27 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Html2rss
4
+ class RequestService
5
+ ##
6
+ # To be used by strategies to provide their response.
7
+ class Response
8
+ ##
9
+ # @param body [String] the body of the response
10
+ # @param headers [Hash] the headers of the response
11
+ def initialize(body:, headers: {})
12
+ @body = body
13
+
14
+ headers = headers.dup
15
+ headers.transform_keys!(&:to_s)
16
+
17
+ @headers = headers
18
+ end
19
+
20
+ # @return [String] the body of the response
21
+ attr_reader :body
22
+
23
+ # @return [Hash<String, Object>] the headers of the response
24
+ attr_reader :headers
25
+ end
26
+ end
27
+ end