html2rss 0.15.0 → 0.17.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35)
  1. checksums.yaml +4 -4
  2. data/README.md +112 -44
  3. data/html2rss.gemspec +3 -2
  4. data/lib/html2rss/attribute_post_processors/sanitize_html.rb +8 -1
  5. data/lib/html2rss/auto_source/article.rb +37 -5
  6. data/lib/html2rss/auto_source/channel.rb +21 -28
  7. data/lib/html2rss/auto_source/cleanup.rb +0 -16
  8. data/lib/html2rss/auto_source/rss_builder.rb +1 -1
  9. data/lib/html2rss/auto_source/scraper/html.rb +21 -12
  10. data/lib/html2rss/auto_source/scraper/schema/item_list.rb +34 -0
  11. data/lib/html2rss/auto_source/scraper/schema/list_item.rb +25 -0
  12. data/lib/html2rss/auto_source/scraper/schema/thing.rb +104 -0
  13. data/lib/html2rss/auto_source/scraper/schema.rb +22 -34
  14. data/lib/html2rss/auto_source/scraper/semantic_html/extractor.rb +41 -41
  15. data/lib/html2rss/auto_source/scraper/semantic_html/image.rb +6 -6
  16. data/lib/html2rss/auto_source/scraper/semantic_html.rb +3 -2
  17. data/lib/html2rss/auto_source.rb +0 -7
  18. data/lib/html2rss/cli.rb +11 -4
  19. data/lib/html2rss/config/channel.rb +7 -1
  20. data/lib/html2rss/config/selectors.rb +2 -1
  21. data/lib/html2rss/config.rb +1 -0
  22. data/lib/html2rss/item.rb +7 -2
  23. data/lib/html2rss/request_service/browserless_strategy.rb +53 -0
  24. data/lib/html2rss/request_service/context.rb +46 -0
  25. data/lib/html2rss/request_service/faraday_strategy.rb +24 -0
  26. data/lib/html2rss/request_service/puppet_commander.rb +61 -0
  27. data/lib/html2rss/request_service/response.rb +27 -0
  28. data/lib/html2rss/request_service/strategy.rb +28 -0
  29. data/lib/html2rss/request_service.rb +97 -0
  30. data/lib/html2rss/rss_builder/stylesheet.rb +7 -0
  31. data/lib/html2rss/utils.rb +23 -26
  32. data/lib/html2rss/version.rb +1 -1
  33. data/lib/html2rss.rb +5 -5
  34. metadata +31 -11
  35. data/lib/html2rss/auto_source/scraper/schema/base.rb +0 -61
data/lib/html2rss/auto_source/scraper/schema/thing.rb ADDED
@@ -0,0 +1,104 @@
+ # frozen_string_literal: true
+
+ require 'date'
+
+ module Html2rss
+   class AutoSource
+     module Scraper
+       class Schema
+         ##
+         # A Thing is kind of the 'base class' for Schema.org schema_objects.
+         #
+         # @see https://schema.org/Thing
+         class Thing
+           SUPPORTED_TYPES = %w[
+             AdvertiserContentArticle
+             AnalysisNewsArticle
+             APIReference
+             Article
+             AskPublicNewsArticle
+             BackgroundNewsArticle
+             BlogPosting
+             DiscussionForumPosting
+             LiveBlogPosting
+             NewsArticle
+             OpinionNewsArticle
+             Report
+             ReportageNewsArticle
+             ReviewNewsArticle
+             SatiricalArticle
+             ScholarlyArticle
+             SocialMediaPosting
+             TechArticle
+           ].to_set.freeze
+
+           DEFAULT_ATTRIBUTES = %i[id title description url image published_at].freeze
+
+           def initialize(schema_object, url:)
+             @schema_object = schema_object
+             @url = url
+           end
+
+           # @return [Hash] the scraped article hash with DEFAULT_ATTRIBUTES
+           def call
+             DEFAULT_ATTRIBUTES.to_h do |attribute|
+               [attribute, public_send(attribute)]
+             end
+           end
+
+           def id
+             return @id if defined?(@id)
+
+             id = (schema_object[:@id] || url&.path).to_s
+
+             return if id.empty?
+
+             @id = id
+           end
+
+           def title = schema_object[:title]
+
+           def description
+             schema_object.values_at(:description, :schema_object_body, :abstract)
+                          .max_by { |string| string.to_s.size }
+           end
+
+           # @return [Addressable::URI, nil] the URL of the schema object
+           def url
+             url = schema_object[:url]
+             if url.to_s.empty?
+               Log.debug("Schema#Thing.url: no url in schema_object: #{schema_object.inspect}")
+               return
+             end
+
+             Utils.build_absolute_url_from_relative(url, @url)
+           end
+
+           def image
+             if (image_url = image_urls.first)
+               Utils.build_absolute_url_from_relative(image_url, @url)
+             end
+           end
+
+           def published_at = schema_object[:datePublished]
+
+           private
+
+           attr_reader :schema_object
+
+           def image_urls
+             schema_object.values_at(:image, :thumbnailUrl).filter_map do |object|
+               next unless object
+
+               if object.is_a?(String)
+                 object
+               elsif object.is_a?(Hash) && object[:@type] == 'ImageObject'
+                 object[:url] || object[:contentUrl]
+               end
+             end
+           end
+         end
+       end
+     end
+   end
+ end
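For orientation, here is a minimal, hypothetical way to exercise the new class on its own. The JSON-LD hash and URLs are invented, and the assumption that `require 'html2rss'` loads the nested class is just that; only `Thing.new(schema_object, url:).call` and the attribute readers come from the diff above.

```ruby
require 'addressable/uri'
require 'html2rss'

# A made-up, already-parsed JSON-LD object with symbol keys.
schema_object = {
  :@type => 'BlogPosting',
  :@id => '/posts/42',
  title: 'Hello world',
  description: 'A short post.',
  url: '/posts/42',
  image: { :@type => 'ImageObject', url: '/images/42.jpg' },
  datePublished: '2024-05-01'
}

thing = Html2rss::AutoSource::Scraper::Schema::Thing.new(
  schema_object,
  url: Addressable::URI.parse('https://example.com')
)

thing.call
# => a hash with :id, :title, :description, :url, :image and :published_at,
#    with relative URLs resolved against https://example.com
```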
data/lib/html2rss/auto_source/scraper/schema.rb CHANGED
@@ -2,58 +2,38 @@
 
  require 'json'
  require 'nokogiri'
- require 'set'
 
  module Html2rss
    class AutoSource
      module Scraper
        ##
-       # Scraps articles from Schema.org objects, by looking for the objects in:
+       # Scrapes articles from Schema.org objects, by looking for the objects in:
 
-       # 1. <script type="application/ld+json"> "schema" tag.
-       # 2. tbd
+       # <script type="application/ld+json"> "schema" tags.
        #
        # See:
-       # 1. https://schema.org/NewsArticle
+       # 1. https://schema.org/docs/full.html
        # 2. https://developers.google.com/search/docs/appearance/structured-data/article#microdata
        class Schema
          include Enumerable
 
          TAG_SELECTOR = 'script[type="application/ld+json"]'
-         SCHEMA_OBJECT_TYPES = %w[
-           AdvertiserContentArticle
-           AnalysisNewsArticle
-           APIReference
-           Article
-           AskPublicNewsArticle
-           BackgroundNewsArticle
-           BlogPosting
-           DiscussionForumPosting
-           LiveBlogPosting
-           NewsArticle
-           OpinionNewsArticle
-           Report
-           ReportageNewsArticle
-           ReviewNewsArticle
-           SatiricalArticle
-           ScholarlyArticle
-           SocialMediaPosting
-           TechArticle
-         ].to_set.freeze
 
          class << self
            def articles?(parsed_body)
             parsed_body.css(TAG_SELECTOR).any? do |script|
-              SCHEMA_OBJECT_TYPES.any? { |type| script.text.match?(/"@type"\s*:\s*"#{Regexp.escape(type)}"/) }
+              (Thing::SUPPORTED_TYPES | ItemList::SUPPORTED_TYPES).any? do |type|
+                script.text.match?(/"@type"\s*:\s*"#{Regexp.escape(type)}"/)
+              end
             end
           end
 
           ##
           # Returns a flat array
           # of all supported schema objects
-          # by recursively traversing the `from` object.
+          # by recursively traversing the given `object`.
           #
-          # @param object [Hash, Array]
+          # @param object [Hash, Array, Nokogiri::XML::Element]
           # @return [Array<Hash>] the schema_objects, or an empty array
           # :reek:DuplicateMethodCall
           def from(object)
@@ -74,12 +54,16 @@ module Html2rss
           end
 
           ##
-          # @return [Scraper::Schema::Base, Scraper::Schema::NewsArticle, nil]
+          # @return [Scraper::Schema::Thing, Scraper::Schema::ItemList, nil] a class responding to `#call`
           def scraper_for_schema_object(schema_object)
-            if SCHEMA_OBJECT_TYPES.member?(schema_object[:@type])
-              Base
+            type = schema_object[:@type]
+
+            if Thing::SUPPORTED_TYPES.member?(type)
+              Thing
+            elsif ItemList::SUPPORTED_TYPES.member?(type)
+              ItemList
             else
-              Log.warn("Schema#scraper_for_schema_object: Unsupported schema object @type: #{schema_object[:@type]}")
+              Log.warn("Schema#scraper_for_schema_object: Unsupported schema object @type: #{type}")
               nil
             end
           end
@@ -107,9 +91,13 @@ module Html2rss
 
         schema_objects.filter_map do |schema_object|
           next unless (klass = self.class.scraper_for_schema_object(schema_object))
-          next unless (article_hash = klass.new(schema_object, url:).call)
+          next unless (results = klass.new(schema_object, url:).call)
 
-          yield article_hash
+          if results.is_a?(Array)
+            results.each { |result| yield(result) } # rubocop:disable Style/ExplicitBlockArgument
+          else
+            yield(results)
+          end
         end
       end
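A quick sketch of what the reworked detection reacts to. The HTML is invented; `articles?` and the `ld+json` selector come from the hunk above, and the constant path assumes `require 'html2rss'` loads the scraper.

```ruby
require 'nokogiri'
require 'html2rss'

html = <<~HTML
  <html>
    <head>
      <script type="application/ld+json">
        {"@type": "NewsArticle", "title": "Example", "url": "/news/1"}
      </script>
    </head>
    <body></body>
  </html>
HTML

parsed_body = Nokogiri::HTML(html)

# True because "NewsArticle" is among Thing::SUPPORTED_TYPES.
Html2rss::AutoSource::Scraper::Schema.articles?(parsed_body) # => true
```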
 
data/lib/html2rss/auto_source/scraper/semantic_html/extractor.rb CHANGED
@@ -1,7 +1,5 @@
  # frozen_string_literal: true
 
- require 'set'
-
  module Html2rss
    class AutoSource
      module Scraper
@@ -33,6 +31,8 @@ module Html2rss
          end
 
          def initialize(article_tag, url:)
+           raise ArgumentError, 'article_tag is required' unless article_tag
+
            @article_tag = article_tag
            @url = url
          end
@@ -57,20 +57,6 @@
 
          attr_reader :article_tag, :url, :heading, :extract_url
 
-         def visible_text_from_tag(tag, separator: ' ') = self.class.visible_text_from_tag(tag, separator:)
-
-         # @see https://developer.mozilla.org/en-US/docs/Web/API/HTMLTimeElement/dateTime
-         def extract_published_at
-           times = article_tag.css('time[datetime]')
-                   .filter_map do |tag|
-             DateTime.parse(tag['datetime'])
-           rescue ArgumentError, TypeError
-             nil
-           end
-
-           times.min
-         end
-
          ##
          # Find the heading of the article.
          # @return [Nokogiri::XML::Node, nil]
@@ -80,18 +66,36 @@ module Html2rss
            return if heading_tags.empty?
 
            smallest_heading = heading_tags.keys.min
-           heading_tags[smallest_heading]&.max_by { |tag| visible_text_from_tag(tag)&.size }
+           heading_tags[smallest_heading]&.max_by { |tag| visible_text_from_tag(tag)&.size.to_i }
+         end
+
+         def visible_text_from_tag(tag, separator: ' ') = self.class.visible_text_from_tag(tag, separator:)
+
+         def closest_anchor
+           SemanticHtml.find_closest_selector(heading || article_tag,
+                                              selector: 'a[href]:not([href=""])')
+         end
+
+         def find_url
+           href = closest_anchor&.[]('href')
+
+           return if (parts = href.to_s.split('#')).empty?
+
+           Utils.build_absolute_url_from_relative(parts.first.strip, url)
          end
 
          def extract_title
-           @extract_title ||= if heading && (heading.children.empty? || heading.text)
-                                visible_text_from_tag(heading)
-                              else
-                                visible_text_from_tag(
-                                  article_tag.css(HEADING_TAGS.join(','))
-                                             .max_by { |tag| tag.text.size }
-                                )
-                              end
+           if heading && (heading.children.empty? || heading.text)
+             visible_text_from_tag(heading)
+           else
+             visible_text_from_tag(article_tag.css(HEADING_TAGS.join(','))
+                                              .max_by { |tag| tag.text.size })
+
+           end
+         end
+
+         def extract_image
+           Image.call(article_tag, url:)
          end
 
          def extract_description
@@ -101,26 +105,10 @@ module Html2rss
            description = visible_text_from_tag(article_tag)
            return nil unless description
 
-           title_text = extract_title
-           description.gsub!(title_text, '') if title_text
            description.strip!
            description.empty? ? nil : description
          end
 
-         def closest_anchor
-           SemanticHtml.find_closest_selector(heading || article_tag,
-                                              selector: 'a[href]:not([href=""])')
-         end
-
-         def find_url
-           href = closest_anchor&.[]('href')&.split('#')&.first&.strip
-           Utils.build_absolute_url_from_relative(href, url) unless href.to_s.empty?
-         end
-
-         def extract_image
-           Image.call(article_tag, url:)
-         end
-
          def generate_id
            [
              article_tag['id'],
@@ -129,6 +117,18 @@
              extract_url&.query
            ].compact.reject(&:empty?).first
          end
+
+         # @see https://developer.mozilla.org/en-US/docs/Web/API/HTMLTimeElement/dateTime
+         def extract_published_at
+           times = article_tag.css('time[datetime]')
+                   .filter_map do |tag|
+             DateTime.parse(tag['datetime'])
+           rescue ArgumentError, TypeError
+             nil
+           end
+
+           times.min
+         end
        end
      end
    end
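Taken together, the reshuffled private methods still feed a single public `call`. A hedged sketch of driving the extractor directly; the markup and base URL are invented, while the constructor and `call` are exactly how `SemanticHtml` invokes it later in this diff.

```ruby
require 'nokogiri'
require 'addressable/uri'
require 'html2rss'

article_tag = Nokogiri::HTML(<<~HTML).at_css('article')
  <article id="post-1">
    <h2><a href="/posts/1#comments">Relative links get resolved</a></h2>
    <time datetime="2024-05-01T10:00:00Z">May 1</time>
    <p>Some body text that becomes the description.</p>
  </article>
HTML

extractor = Html2rss::AutoSource::Scraper::SemanticHtml::Extractor.new(
  article_tag,
  url: Addressable::URI.parse('https://example.com')
)

extractor.call
# Roughly: the <h2> text as title, the anchor resolved to
# https://example.com/posts/1 (fragment stripped by find_url),
# and the parsed <time datetime> as the earliest published_at.
```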
data/lib/html2rss/auto_source/scraper/semantic_html/image.rb CHANGED
@@ -28,14 +28,14 @@
          # @see <https://developer.mozilla.org/en-US/docs/Web/HTML/Element/picture>
          def self.from_source(article_tag) # rubocop:disable Metrics/AbcSize
            hash = article_tag.css('img[srcset], picture > source[srcset]')
-                  .flat_map { |source| source['srcset'].to_s.split(',') }
-                  .filter_map do |line|
-             width, url = line.split.reverse
-             next if url.nil? || url.start_with?('data:')
+                  .flat_map do |source|
+             source['srcset'].to_s.scan(/(\S+)\s+(\d+w|\d+h)/).map do |url, width|
+               next if url.nil? || url.start_with?('data:')
 
-             width_value = width.to_i.zero? ? 0 : width.scan(/\d+/).first.to_i
+               width_value = width.to_i.zero? ? 0 : width.scan(/\d+/).first.to_i
 
-             [width_value, url.strip]
+               [width_value, url.strip]
+             end
            end.to_h
 
            hash[hash.keys.max]
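The move from `split(',')` to `scan` is the heart of this hunk: each srcset candidate is matched as a URL plus a width/height descriptor, and the widest one wins. The same pattern in plain Ruby, minus the `data:` URI guard shown above:

```ruby
srcset = 'small.jpg 480w, medium.jpg 800w, large.jpg 1024w'

candidates = srcset.scan(/(\S+)\s+(\d+w|\d+h)/).to_h do |url, width|
  [width.to_i, url] # => { 480 => 'small.jpg', 800 => 'medium.jpg', 1024 => 'large.jpg' }
end

candidates[candidates.keys.max] # => "large.jpg"
```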
data/lib/html2rss/auto_source/scraper/semantic_html.rb CHANGED
@@ -106,9 +106,10 @@
          SemanticHtml.anchor_tag_selector_pairs.each do |tag_name, selector|
            parsed_body.css(selector).each do |selected_tag|
              article_tag = SemanticHtml.find_tag_in_ancestors(selected_tag, tag_name)
-             article_hash = Extractor.new(article_tag, url: @url).call
 
-             yield article_hash if article_hash
+             if article_tag && (article_hash = Extractor.new(article_tag, url: @url).call)
+               yield article_hash
+             end
            end
          end
        end
data/lib/html2rss/auto_source.rb CHANGED
@@ -11,20 +11,13 @@
  # It uses a set of ArticleExtractors to extract articles, utilizing popular ways of
  # marking articles, e.g. schema, microdata, open graph, etc.
  class AutoSource
-   class UnsupportedUrlScheme < Html2rss::Error; end
    class NoArticlesFound < Html2rss::Error; end
 
-   SUPPORTED_URL_SCHEMES = %w[http https].to_set.freeze
-
    ##
    # @param url [Addressable::URI] The URL to extract articles from.
    # @param body [String] The body of the response.
    # @param headers [Hash] The headers of the response.
    def initialize(url, body:, headers: {})
-     raise ArgumentError, 'URL must be a Addressable::URI' unless url.is_a?(Addressable::URI)
-     raise ArgumentError, 'URL must be absolute' unless url.absolute?
-     raise UnsupportedUrlScheme, "#{url.scheme} not supported" unless SUPPORTED_URL_SCHEMES.include?(url.scheme)
-
      @url = url
      @body = body
      @headers = headers
data/lib/html2rss/cli.rb CHANGED
@@ -2,7 +2,6 @@
 
  require_relative '../html2rss'
  require 'thor'
- require 'addressable'
 
  ##
  # The Html2rss namespace / command line interface.
@@ -26,14 +25,22 @@ module Html2rss
      def feed(yaml_file, *options)
        raise "File '#{yaml_file}' does not exist" unless File.exist?(yaml_file)
 
-       feed_name = options.shift
+       feed_name = options.shift unless options.first&.include?('=')
        params = options.to_h { |opt| opt.split('=', 2) }
+
        puts Html2rss.feed_from_yaml_config(yaml_file, feed_name, params:)
      end
 
-     desc 'auto URL', 'automatically sources an RSS feed from the URL'
+     desc 'auto URL', 'Automatically sources an RSS feed from the URL'
+     method_option :strategy,
+                   type: :string,
+                   desc: 'The strategy to request the URL',
+                   enum: RequestService.strategy_names,
+                   default: RequestService.default_strategy_name
      def auto(url)
-       puts Html2rss.auto_source(url)
+       strategy = options.fetch(:strategy) { RequestService.default_strategy_name }.to_sym
+
+       puts Html2rss.auto_source(url, strategy:)
      end
    end
  end
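The CLI forwards the flag to `Html2rss.auto_source(url, strategy:)`. Assuming the registered strategy names mirror the class names (e.g. `:faraday` and `:browserless`, an inference not spelled out in this hunk), usage could look like:

```ruby
require 'html2rss'

# Default strategy, equivalent to `html2rss auto URL` without --strategy:
puts Html2rss.auto_source('https://example.com/blog',
                          strategy: Html2rss::RequestService.default_strategy_name.to_sym)

# Hypothetical: force the headless-browser strategy instead.
puts Html2rss.auto_source('https://example.com/blog', strategy: :browserless)
```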
data/lib/html2rss/config/channel.rb CHANGED
@@ -55,7 +55,7 @@ module Html2rss
        ##
        # @return [String]
        def title
-         config.fetch(:title) { Utils.titleized_url(url) }
+         config.fetch(:title) { Utils.titleized_channel_url(url) }
        end
 
        ##
@@ -88,6 +88,12 @@ module Html2rss
          config.fetch(:json, false)
        end
 
+       ##
+       # @return [Symbol]
+       def strategy
+         config.fetch(:strategy) { RequestService.default_strategy_name }.to_sym
+       end
+
        private
 
        # @return [Hash<Symbol, Object>]
data/lib/html2rss/config/selectors.rb CHANGED
@@ -8,7 +8,8 @@ module Html2rss
      ITEMS_SELECTOR_NAME = :items
 
      # Struct to represent a selector with associated attributes for extraction and processing.
-     Selector = Struct.new(:selector, :attribute, :extractor, :post_process, :order, :static, keyword_init: true)
+     Selector = Struct.new(:selector, :attribute, :extractor, :post_process, :order, :static, :content_type,
+                           keyword_init: true)
 
      # raised when an invalid selector name is used
      class InvalidSelectorName < Html2rss::Error; end
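The new `content_type` member is what the enclosure handling later in this diff reads back; a small illustrative use of the struct itself (values invented):

```ruby
require 'html2rss'

selector = Html2rss::Config::Selectors::Selector.new(
  selector: 'enclosure',       # CSS selector, illustrative
  attribute: 'href',
  content_type: 'audio/mpeg'   # skips MIME-type guessing from the URL
)

selector.content_type # => "audio/mpeg"
```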
data/lib/html2rss/config.rb CHANGED
@@ -27,6 +27,7 @@ module Html2rss
    def_delegator :@channel, :url, :link
    def_delegator :@channel, :time_zone
    def_delegator :@channel, :json?
+   def_delegator :@channel, :strategy
 
    def_delegator :@selectors, :item_selector_names
    def_delegator :@selectors, :selector?
data/lib/html2rss/item.rb CHANGED
@@ -23,7 +23,9 @@ module Html2rss
    # @param config [Html2rss::Config] Configuration object.
    # @return [Array<Html2rss::Item>] list of items fetched.
    def self.from_url(url, config)
-     body = Utils.request_url(url, headers: config.headers).body
+     ctx = RequestService::Context.new(url:, headers: config.headers)
+
+     body = RequestService.execute(ctx, strategy: config.strategy).body
      body = ObjectToXmlConverter.new(JSON.parse(body)).call if config.json?
 
      Nokogiri.HTML(body)
@@ -136,8 +138,11 @@
 
      raise 'An item.enclosure requires an absolute URL' unless url&.absolute?
 
+     type = config.selector_attributes_with_channel(:enclosure)[:content_type] ||
+            Html2rss::Utils.guess_content_type_from_url(url)
+
      Enclosure.new(
-       type: Html2rss::Utils.guess_content_type_from_url(url),
+       type:,
        bits_length: 0,
        url: url.to_s
      )
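The reworked `from_url` now goes Context → strategy → Nokogiri. The same three steps outside the class, with an invented URL and the assumption that `:faraday` is a registered strategy name:

```ruby
require 'html2rss'
require 'nokogiri'

ctx = Html2rss::RequestService::Context.new(
  url: 'https://example.com/blog',
  headers: { 'User-Agent' => 'html2rss' }
)

response = Html2rss::RequestService.execute(ctx, strategy: :faraday)
doc      = Nokogiri.HTML(response.body)
doc.css('article').size
```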
data/lib/html2rss/request_service/browserless_strategy.rb ADDED
@@ -0,0 +1,53 @@
+ # frozen_string_literal: true
+
+ require 'puppeteer'
+
+ module Html2rss
+   class RequestService
+     ##
+     # Browserless.io strategy to request websites.
+     #
+     # Provide the WebSocket URL and your API token via environment variables:
+     # - BROWSERLESS_IO_WEBSOCKET_URL
+     # - BROWSERLESS_IO_API_TOKEN
+     #
+     # To use this strategy, you need to have a Browserless.io account or run a
+     # local Browserless.io instance.
+     #
+     # @see https://www.browserless.io/
+     #
+     # To run a local Browserless.io instance, you can use the following Docker command:
+     #
+     # ```sh
+     # docker run \
+     #   --rm \
+     #   -p 3000:3000 \
+     #   -e "CONCURRENT=10" \
+     #   -e "TOKEN=6R0W53R135510" \
+     #   ghcr.io/browserless/chromium
+     # ```
+     #
+     # When running locally, you can skip setting the environment variables, as above commands
+     # are aligned with the default values.
+     # @see https://github.com/browserless/browserless/pkgs/container/chromium
+     class BrowserlessStrategy < Strategy
+       # return [Response]
+       def execute
+         Puppeteer.connect(browser_ws_endpoint:) do |browser|
+           PuppetCommander.new(ctx, browser).call
+         ensure
+           browser.disconnect
+         end
+       end
+
+       def browser_ws_endpoint
+         @browser_ws_endpoint ||= begin
+           api_token = ENV.fetch('BROWSERLESS_IO_API_TOKEN', '6R0W53R135510')
+           ws_url = ENV.fetch('BROWSERLESS_IO_WEBSOCKET_URL', 'ws://127.0.0.1:3000')
+
+           "#{ws_url}?token=#{api_token}"
+         end
+       end
+     end
+   end
+ end
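A hedged sketch of pointing the gem at the local container from the comment above. Only the environment variable names and defaults come from the code; the `:browserless` strategy name is assumed:

```ruby
require 'html2rss'

# Optional when using the docker command above, since these match its defaults.
ENV['BROWSERLESS_IO_WEBSOCKET_URL'] = 'ws://127.0.0.1:3000'
ENV['BROWSERLESS_IO_API_TOKEN']     = '6R0W53R135510'

ctx = Html2rss::RequestService::Context.new(url: 'https://example.com/spa')
puts Html2rss::RequestService.execute(ctx, strategy: :browserless).body
```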
data/lib/html2rss/request_service/context.rb ADDED
@@ -0,0 +1,46 @@
+ # frozen_string_literal: true
+
+ require 'addressable/uri'
+
+ module Html2rss
+   class RequestService
+     ##
+     # Holds information needed to send requests to websites.
+     # To be passed down to the RequestService's strategies.
+     class Context
+       SUPPORTED_URL_SCHEMES = %w[http https].to_set.freeze
+
+       ##
+       # @param url [String, Addressable::URI] the URL to request
+       # @param headers [Hash] HTTP request headers
+       def initialize(url:, headers: {})
+         @url = Addressable::URI.parse(url)
+         assert_valid_url!
+
+         @headers = headers
+       end
+
+       # @return [Addressable::URI] the parsed URL
+       attr_reader :url
+
+       # @return [Hash] the HTTP request headers
+       attr_reader :headers
+
+       private
+
+       ##
+       # Validates the URL.
+       # @raise [InvalidUrl] if the URL is not valid
+       # @raise [UnsupportedUrlScheme] if the URL scheme is not supported
+       def assert_valid_url!
+         raise InvalidUrl, 'URL must be absolute' unless url.absolute?
+         raise InvalidUrl, 'URL must not contain an @ character' if url.to_s.include?('@')
+
+         return if SUPPORTED_URL_SCHEMES.include?(url.scheme)
+
+         raise UnsupportedUrlScheme,
+               "URL scheme '#{url.scheme}' is not supported"
+       end
+     end
+   end
+ end
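The validation that used to live in `AutoSource#initialize` (removed earlier in this diff) now happens here. A small sketch; the exact error class raised is defined in `request_service.rb`, which is not shown in this diff, so the rescue below stays generic:

```ruby
require 'html2rss'

ctx = Html2rss::RequestService::Context.new(
  url: 'https://example.com/feed',
  headers: { 'Accept' => 'text/html' }
)
ctx.url.host # => "example.com"
ctx.headers  # => { "Accept" => "text/html" }

begin
  Html2rss::RequestService::Context.new(url: 'ftp://example.com')
rescue StandardError => error
  error.class # the UnsupportedUrlScheme error raised by assert_valid_url!
end
```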
data/lib/html2rss/request_service/faraday_strategy.rb ADDED
@@ -0,0 +1,24 @@
+ # frozen_string_literal: true
+
+ require 'faraday'
+ require 'faraday/follow_redirects'
+
+ module Html2rss
+   class RequestService
+     ##
+     # Strategy to use Faraday for the request.
+     # @see https://rubygems.org/gems/faraday
+     class FaradayStrategy < Strategy
+       # return [Response]
+       def execute
+         request = Faraday.new(url: ctx.url, headers: ctx.headers) do |faraday|
+           faraday.use Faraday::FollowRedirects::Middleware
+           faraday.adapter Faraday.default_adapter
+         end
+         response = request.get
+
+         Response.new(body: response.body, headers: response.headers)
+       end
+     end
+   end
+ end
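For comparison, the same request issued with Faraday directly, mirroring the calls inside `execute` (host and header are placeholders):

```ruby
require 'faraday'
require 'faraday/follow_redirects'

connection = Faraday.new(url: 'https://example.com', headers: { 'User-Agent' => 'html2rss' }) do |faraday|
  faraday.use Faraday::FollowRedirects::Middleware
  faraday.adapter Faraday.default_adapter
end

response = connection.get
[response.status, response.headers['content-type'], response.body.bytesize]
```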
data/lib/html2rss/request_service/puppet_commander.rb ADDED
@@ -0,0 +1,61 @@
+ # frozen_string_literal: true
+
+ module Html2rss
+   class RequestService
+     ##
+     # Commands the Puppeteer Browser to the website and builds the Response.
+     class PuppetCommander
+       # @param ctx [Context]
+       # @param browser [Puppeteer::Browser]
+       # @param skip_request_resources [Set<String>] the resource types not to request
+       # @param referer [String] the referer to use for the request
+       def initialize(ctx,
+                      browser,
+                      skip_request_resources: %w[stylesheet image media font].to_set,
+                      referer: [ctx.url.scheme, ctx.url.host].join('://'))
+         @ctx = ctx
+         @browser = browser
+         @skip_request_resources = skip_request_resources
+         @referer = referer
+       end
+
+       # @return [Response]
+       def call
+         page = new_page
+
+         response = navigate_to_destination(page, ctx.url)
+
+         Response.new(body: body(page), headers: response.headers)
+       ensure
+         page&.close
+       end
+
+       ##
+       # @return [Puppeteer::Page]
+       # @see https://yusukeiwaki.github.io/puppeteer-ruby-docs/Puppeteer/Page.html
+       def new_page
+         page = browser.new_page
+         page.extra_http_headers = ctx.headers
+
+         return page if skip_request_resources.empty?
+
+         page.request_interception = true
+         page.on('request') do |request|
+           skip_request_resources.member?(request.resource_type) ? request.abort : request.continue
+         end
+
+         page
+       end
+
+       def navigate_to_destination(page, url)
+         page.goto(url, wait_until: 'networkidle0', referer:)
+       end
+
+       def body(page) = page.content
+
+       private
+
+       attr_reader :ctx, :browser, :skip_request_resources, :referer
+     end
+   end
+ end
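PuppetCommander is normally driven by BrowserlessStrategy, but its keyword arguments show the tunable parts. A hypothetical direct use that keeps stylesheets, assuming a reachable Chromium endpoint and token as in the local-container example above:

```ruby
require 'set'
require 'puppeteer'
require 'html2rss'

ctx = Html2rss::RequestService::Context.new(url: 'https://example.com')

Puppeteer.connect(browser_ws_endpoint: 'ws://127.0.0.1:3000?token=6R0W53R135510') do |browser|
  commander = Html2rss::RequestService::PuppetCommander.new(
    ctx,
    browser,
    skip_request_resources: %w[image media font].to_set # let stylesheets load this time
  )
  puts commander.call.body
ensure
  browser.disconnect
end
```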