html2rss 0.14.0 → 0.16.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +113 -44
- data/html2rss.gemspec +3 -2
- data/lib/html2rss/auto_source/article.rb +37 -5
- data/lib/html2rss/auto_source/channel.rb +21 -28
- data/lib/html2rss/auto_source/cleanup.rb +0 -16
- data/lib/html2rss/auto_source/rss_builder.rb +1 -1
- data/lib/html2rss/auto_source/scraper/html.rb +96 -0
- data/lib/html2rss/auto_source/scraper/schema/item_list.rb +34 -0
- data/lib/html2rss/auto_source/scraper/schema/list_item.rb +25 -0
- data/lib/html2rss/auto_source/scraper/schema/thing.rb +104 -0
- data/lib/html2rss/auto_source/scraper/schema.rb +22 -33
- data/lib/html2rss/auto_source/scraper/semantic_html/extractor.rb +51 -38
- data/lib/html2rss/auto_source/scraper.rb +1 -0
- data/lib/html2rss/auto_source.rb +0 -7
- data/lib/html2rss/cli.rb +11 -4
- data/lib/html2rss/config/channel.rb +7 -1
- data/lib/html2rss/config/selectors.rb +2 -1
- data/lib/html2rss/config.rb +1 -0
- data/lib/html2rss/item.rb +7 -2
- data/lib/html2rss/request_service/browserless_strategy.rb +53 -0
- data/lib/html2rss/request_service/context.rb +46 -0
- data/lib/html2rss/request_service/faraday_strategy.rb +24 -0
- data/lib/html2rss/request_service/puppet_commander.rb +61 -0
- data/lib/html2rss/request_service/response.rb +27 -0
- data/lib/html2rss/request_service/strategy.rb +28 -0
- data/lib/html2rss/request_service.rb +97 -0
- data/lib/html2rss/rss_builder/stylesheet.rb +7 -0
- data/lib/html2rss/utils.rb +23 -26
- data/lib/html2rss/version.rb +1 -1
- data/lib/html2rss.rb +7 -6
- metadata +35 -11
- data/lib/html2rss/auto_source/scraper/schema/base.rb +0 -61
@@ -0,0 +1,104 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'date'
|
4
|
+
|
5
|
+
module Html2rss
|
6
|
+
class AutoSource
|
7
|
+
module Scraper
|
8
|
+
class Schema
|
9
|
+
##
|
10
|
+
# A Thing is kind of the 'base class' for Schema.org schema_objects.
|
11
|
+
#
|
12
|
+
# @see https://schema.org/Thing
|
13
|
+
class Thing
|
14
|
+
SUPPORTED_TYPES = %w[
|
15
|
+
AdvertiserContentArticle
|
16
|
+
AnalysisNewsArticle
|
17
|
+
APIReference
|
18
|
+
Article
|
19
|
+
AskPublicNewsArticle
|
20
|
+
BackgroundNewsArticle
|
21
|
+
BlogPosting
|
22
|
+
DiscussionForumPosting
|
23
|
+
LiveBlogPosting
|
24
|
+
NewsArticle
|
25
|
+
OpinionNewsArticle
|
26
|
+
Report
|
27
|
+
ReportageNewsArticle
|
28
|
+
ReviewNewsArticle
|
29
|
+
SatiricalArticle
|
30
|
+
ScholarlyArticle
|
31
|
+
SocialMediaPosting
|
32
|
+
TechArticle
|
33
|
+
].to_set.freeze
|
34
|
+
|
35
|
+
DEFAULT_ATTRIBUTES = %i[id title description url image published_at].freeze
|
36
|
+
|
37
|
+
def initialize(schema_object, url:)
|
38
|
+
@schema_object = schema_object
|
39
|
+
@url = url
|
40
|
+
end
|
41
|
+
|
42
|
+
# @return [Hash] the scraped article hash with DEFAULT_ATTRIBUTES
|
43
|
+
def call
|
44
|
+
DEFAULT_ATTRIBUTES.to_h do |attribute|
|
45
|
+
[attribute, public_send(attribute)]
|
46
|
+
end
|
47
|
+
end
|
48
|
+
|
49
|
+
def id
|
50
|
+
return @id if defined?(@id)
|
51
|
+
|
52
|
+
id = (schema_object[:@id] || url&.path).to_s
|
53
|
+
|
54
|
+
return if id.empty?
|
55
|
+
|
56
|
+
@id = id
|
57
|
+
end
|
58
|
+
|
59
|
+
def title = schema_object[:title]
|
60
|
+
|
61
|
+
def description
|
62
|
+
schema_object.values_at(:description, :schema_object_body, :abstract)
|
63
|
+
.max_by { |string| string.to_s.size }
|
64
|
+
end
|
65
|
+
|
66
|
+
# @return [Addressable::URI, nil] the URL of the schema object
|
67
|
+
def url
|
68
|
+
url = schema_object[:url]
|
69
|
+
if url.to_s.empty?
|
70
|
+
Log.debug("Schema#Thing.url: no url in schema_object: #{schema_object.inspect}")
|
71
|
+
return
|
72
|
+
end
|
73
|
+
|
74
|
+
Utils.build_absolute_url_from_relative(url, @url)
|
75
|
+
end
|
76
|
+
|
77
|
+
def image
|
78
|
+
if (image_url = image_urls.first)
|
79
|
+
Utils.build_absolute_url_from_relative(image_url, @url)
|
80
|
+
end
|
81
|
+
end
|
82
|
+
|
83
|
+
def published_at = schema_object[:datePublished]
|
84
|
+
|
85
|
+
private
|
86
|
+
|
87
|
+
attr_reader :schema_object
|
88
|
+
|
89
|
+
def image_urls
|
90
|
+
schema_object.values_at(:image, :thumbnailUrl).filter_map do |object|
|
91
|
+
next unless object
|
92
|
+
|
93
|
+
if object.is_a?(String)
|
94
|
+
object
|
95
|
+
elsif object.is_a?(Hash) && object[:@type] == 'ImageObject'
|
96
|
+
object[:url] || object[:contentUrl]
|
97
|
+
end
|
98
|
+
end
|
99
|
+
end
|
100
|
+
end
|
101
|
+
end
|
102
|
+
end
|
103
|
+
end
|
104
|
+
end
|
@@ -8,52 +8,33 @@ module Html2rss
|
|
8
8
|
class AutoSource
|
9
9
|
module Scraper
|
10
10
|
##
|
11
|
-
#
|
11
|
+
# Scrapes articles from Schema.org objects, by looking for the objects in:
|
12
12
|
|
13
|
-
#
|
14
|
-
# 2. tbd
|
13
|
+
# <script type="application/ld+json"> "schema" tags.
|
15
14
|
#
|
16
15
|
# See:
|
17
|
-
# 1. https://schema.org/
|
16
|
+
# 1. https://schema.org/docs/full.html
|
18
17
|
# 2. https://developers.google.com/search/docs/appearance/structured-data/article#microdata
|
19
18
|
class Schema
|
20
19
|
include Enumerable
|
21
20
|
|
22
21
|
TAG_SELECTOR = 'script[type="application/ld+json"]'
|
23
|
-
SCHEMA_OBJECT_TYPES = %w[
|
24
|
-
AdvertiserContentArticle
|
25
|
-
AnalysisNewsArticle
|
26
|
-
APIReference
|
27
|
-
Article
|
28
|
-
AskPublicNewsArticle
|
29
|
-
BackgroundNewsArticle
|
30
|
-
BlogPosting
|
31
|
-
DiscussionForumPosting
|
32
|
-
LiveBlogPosting
|
33
|
-
NewsArticle
|
34
|
-
OpinionNewsArticle
|
35
|
-
Report
|
36
|
-
ReportageNewsArticle
|
37
|
-
ReviewNewsArticle
|
38
|
-
SatiricalArticle
|
39
|
-
ScholarlyArticle
|
40
|
-
SocialMediaPosting
|
41
|
-
TechArticle
|
42
|
-
].to_set.freeze
|
43
22
|
|
44
23
|
class << self
|
45
24
|
def articles?(parsed_body)
|
46
25
|
parsed_body.css(TAG_SELECTOR).any? do |script|
|
47
|
-
|
26
|
+
(Thing::SUPPORTED_TYPES | ItemList::SUPPORTED_TYPES).any? do |type|
|
27
|
+
script.text.match?(/"@type"\s*:\s*"#{Regexp.escape(type)}"/)
|
28
|
+
end
|
48
29
|
end
|
49
30
|
end
|
50
31
|
|
51
32
|
##
|
52
33
|
# Returns a flat array
|
53
34
|
# of all supported schema objects
|
54
|
-
# by recursively traversing the `
|
35
|
+
# by recursively traversing the given `object`.
|
55
36
|
#
|
56
|
-
# @param object [Hash, Array]
|
37
|
+
# @param object [Hash, Array, Nokogiri::XML::Element]
|
57
38
|
# @return [Array<Hash>] the schema_objects, or an empty array
|
58
39
|
# :reek:DuplicateMethodCall
|
59
40
|
def from(object)
|
@@ -74,12 +55,16 @@ module Html2rss
|
|
74
55
|
end
|
75
56
|
|
76
57
|
##
|
77
|
-
# @return [Scraper::Schema::
|
58
|
+
# @return [Class<Scraper::Schema::Thing>, Class<Scraper::Schema::ItemList>, nil] a class whose instances respond to `#call`
|
78
59
|
def scraper_for_schema_object(schema_object)
|
79
|
-
|
80
|
-
|
60
|
+
type = schema_object[:@type]
|
61
|
+
|
62
|
+
if Thing::SUPPORTED_TYPES.member?(type)
|
63
|
+
Thing
|
64
|
+
elsif ItemList::SUPPORTED_TYPES.member?(type)
|
65
|
+
ItemList
|
81
66
|
else
|
82
|
-
Log.warn("Schema#scraper_for_schema_object: Unsupported schema object @type: #{
|
67
|
+
Log.warn("Schema#scraper_for_schema_object: Unsupported schema object @type: #{type}")
|
83
68
|
nil
|
84
69
|
end
|
85
70
|
end
|
@@ -107,9 +92,13 @@ module Html2rss
|
|
107
92
|
|
108
93
|
schema_objects.filter_map do |schema_object|
|
109
94
|
next unless (klass = self.class.scraper_for_schema_object(schema_object))
|
110
|
-
next unless (
|
95
|
+
next unless (results = klass.new(schema_object, url:).call)
|
111
96
|
|
112
|
-
|
97
|
+
if results.is_a?(Array)
|
98
|
+
results.each { |result| yield(result) } # rubocop:disable Style/ExplicitBlockArgument
|
99
|
+
else
|
100
|
+
yield(results)
|
101
|
+
end
|
113
102
|
end
|
114
103
|
end
|
115
104
|
|
@@ -35,13 +35,13 @@ module Html2rss
|
|
35
35
|
def initialize(article_tag, url:)
|
36
36
|
@article_tag = article_tag
|
37
37
|
@url = url
|
38
|
-
@heading = find_heading
|
39
|
-
@extract_url = find_url
|
40
38
|
end
|
41
39
|
|
42
40
|
# @return [Hash, nil] The scraped article or nil.
|
43
41
|
def call
|
44
|
-
|
42
|
+
@heading = find_heading || closest_anchor || return
|
43
|
+
|
44
|
+
@extract_url = find_url
|
45
45
|
|
46
46
|
{
|
47
47
|
title: extract_title,
|
@@ -57,35 +57,45 @@ module Html2rss
|
|
57
57
|
|
58
58
|
attr_reader :article_tag, :url, :heading, :extract_url
|
59
59
|
|
60
|
-
|
60
|
+
##
|
61
|
+
# Find the heading of the article.
|
62
|
+
# @return [Nokogiri::XML::Node, nil]
|
63
|
+
def find_heading
|
64
|
+
heading_tags = article_tag.css(HEADING_TAGS.join(',')).group_by(&:name)
|
61
65
|
|
62
|
-
|
63
|
-
def extract_published_at
|
64
|
-
times = article_tag.css('time[datetime]')
|
65
|
-
.filter_map do |tag|
|
66
|
-
DateTime.parse(tag['datetime'])
|
67
|
-
rescue ArgumentError, TypeError
|
68
|
-
nil
|
69
|
-
end
|
66
|
+
return if heading_tags.empty?
|
70
67
|
|
71
|
-
|
68
|
+
smallest_heading = heading_tags.keys.min
|
69
|
+
heading_tags[smallest_heading]&.max_by { |tag| visible_text_from_tag(tag)&.size.to_i }
|
72
70
|
end
|
73
71
|
|
74
|
-
def
|
75
|
-
|
76
|
-
|
77
|
-
|
72
|
+
def visible_text_from_tag(tag, separator: ' ') = self.class.visible_text_from_tag(tag, separator:)
|
73
|
+
|
74
|
+
def closest_anchor
|
75
|
+
SemanticHtml.find_closest_selector(heading || article_tag,
|
76
|
+
selector: 'a[href]:not([href=""])')
|
77
|
+
end
|
78
|
+
|
79
|
+
def find_url
|
80
|
+
href = closest_anchor&.[]('href')
|
81
|
+
|
82
|
+
return if (parts = href.to_s.split('#')).empty?
|
83
|
+
|
84
|
+
Utils.build_absolute_url_from_relative(parts.first.strip, url)
|
78
85
|
end
|
79
86
|
|
80
87
|
def extract_title
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
88
|
+
if heading && (heading.children.empty? || heading.text)
|
89
|
+
visible_text_from_tag(heading)
|
90
|
+
else
|
91
|
+
visible_text_from_tag(article_tag.css(HEADING_TAGS.join(','))
|
92
|
+
.max_by { |tag| tag.text.size })
|
93
|
+
|
94
|
+
end
|
95
|
+
end
|
96
|
+
|
97
|
+
def extract_image
|
98
|
+
Image.call(article_tag, url:)
|
89
99
|
end
|
90
100
|
|
91
101
|
def extract_description
|
@@ -95,26 +105,29 @@ module Html2rss
|
|
95
105
|
description = visible_text_from_tag(article_tag)
|
96
106
|
return nil unless description
|
97
107
|
|
98
|
-
title_text = extract_title
|
99
|
-
description.gsub!(title_text, '') if title_text
|
100
108
|
description.strip!
|
101
109
|
description.empty? ? nil : description
|
102
110
|
end
|
103
111
|
|
104
|
-
def
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
112
|
+
def generate_id
|
113
|
+
[
|
114
|
+
article_tag['id'],
|
115
|
+
article_tag.at_css('[id]')&.attr('id'),
|
116
|
+
extract_url&.path,
|
117
|
+
extract_url&.query
|
118
|
+
].compact.reject(&:empty?).first
|
109
119
|
end
|
110
120
|
|
111
|
-
|
112
|
-
|
113
|
-
|
121
|
+
# @see https://developer.mozilla.org/en-US/docs/Web/API/HTMLTimeElement/dateTime
|
122
|
+
def extract_published_at
|
123
|
+
times = article_tag.css('time[datetime]')
|
124
|
+
.filter_map do |tag|
|
125
|
+
DateTime.parse(tag['datetime'])
|
126
|
+
rescue ArgumentError, TypeError
|
127
|
+
nil
|
128
|
+
end
|
114
129
|
|
115
|
-
|
116
|
-
[article_tag['id'], article_tag.at_css('[id]')&.attr('id'),
|
117
|
-
extract_url&.path].compact.reject(&:empty?).first
|
130
|
+
times.min
|
118
131
|
end
|
119
132
|
end
|
120
133
|
end
|
data/lib/html2rss/auto_source.rb
CHANGED
@@ -11,20 +11,13 @@ module Html2rss
|
|
11
11
|
# It uses a set of ArticleExtractors to extract articles, utilizing popular ways of
|
12
12
|
# marking articles, e.g. schema, microdata, open graph, etc.
|
13
13
|
class AutoSource
|
14
|
-
class UnsupportedUrlScheme < Html2rss::Error; end
|
15
14
|
class NoArticlesFound < Html2rss::Error; end
|
16
15
|
|
17
|
-
SUPPORTED_URL_SCHEMES = %w[http https].to_set.freeze
|
18
|
-
|
19
16
|
##
|
20
17
|
# @param url [Addressable::URI] The URL to extract articles from.
|
21
18
|
# @param body [String] The body of the response.
|
22
19
|
# @param headers [Hash] The headers of the response.
|
23
20
|
def initialize(url, body:, headers: {})
|
24
|
-
raise ArgumentError, 'URL must be a Addressable::URI' unless url.is_a?(Addressable::URI)
|
25
|
-
raise ArgumentError, 'URL must be absolute' unless url.absolute?
|
26
|
-
raise UnsupportedUrlScheme, "#{url.scheme} not supported" unless SUPPORTED_URL_SCHEMES.include?(url.scheme)
|
27
|
-
|
28
21
|
@url = url
|
29
22
|
@body = body
|
30
23
|
@headers = headers
|
data/lib/html2rss/cli.rb
CHANGED
@@ -2,7 +2,6 @@
|
|
2
2
|
|
3
3
|
require_relative '../html2rss'
|
4
4
|
require 'thor'
|
5
|
-
require 'addressable'
|
6
5
|
|
7
6
|
##
|
8
7
|
# The Html2rss namespace / command line interface.
|
@@ -26,14 +25,22 @@ module Html2rss
|
|
26
25
|
def feed(yaml_file, *options)
|
27
26
|
raise "File '#{yaml_file}' does not exist" unless File.exist?(yaml_file)
|
28
27
|
|
29
|
-
feed_name = options.shift
|
28
|
+
feed_name = options.shift unless options.first&.include?('=')
|
30
29
|
params = options.to_h { |opt| opt.split('=', 2) }
|
30
|
+
|
31
31
|
puts Html2rss.feed_from_yaml_config(yaml_file, feed_name, params:)
|
32
32
|
end
|
33
33
|
|
34
|
-
desc 'auto URL', '
|
34
|
+
desc 'auto URL', 'Automatically sources an RSS feed from the URL'
|
35
|
+
method_option :strategy,
|
36
|
+
type: :string,
|
37
|
+
desc: 'The strategy to request the URL',
|
38
|
+
enum: RequestService.strategy_names,
|
39
|
+
default: RequestService.default_strategy_name
|
35
40
|
def auto(url)
|
36
|
-
|
41
|
+
strategy = options.fetch(:strategy) { RequestService.default_strategy_name }.to_sym
|
42
|
+
|
43
|
+
puts Html2rss.auto_source(url, strategy:)
|
37
44
|
end
|
38
45
|
end
|
39
46
|
end
|
@@ -55,7 +55,7 @@ module Html2rss
|
|
55
55
|
##
|
56
56
|
# @return [String]
|
57
57
|
def title
|
58
|
-
config.fetch(:title) { Utils.
|
58
|
+
config.fetch(:title) { Utils.titleized_channel_url(url) }
|
59
59
|
end
|
60
60
|
|
61
61
|
##
|
@@ -88,6 +88,12 @@ module Html2rss
|
|
88
88
|
config.fetch(:json, false)
|
89
89
|
end
|
90
90
|
|
91
|
+
##
|
92
|
+
# @return [Symbol]
|
93
|
+
def strategy
|
94
|
+
config.fetch(:strategy) { RequestService.default_strategy_name }.to_sym
|
95
|
+
end
|
96
|
+
|
91
97
|
private
|
92
98
|
|
93
99
|
# @return [Hash<Symbol, Object>]
|
@@ -8,7 +8,8 @@ module Html2rss
|
|
8
8
|
ITEMS_SELECTOR_NAME = :items
|
9
9
|
|
10
10
|
# Struct to represent a selector with associated attributes for extraction and processing.
|
11
|
-
Selector = Struct.new(:selector, :attribute, :extractor, :post_process, :order, :static,
|
11
|
+
Selector = Struct.new(:selector, :attribute, :extractor, :post_process, :order, :static, :content_type,
|
12
|
+
keyword_init: true)
|
12
13
|
|
13
14
|
# raised when an invalid selector name is used
|
14
15
|
class InvalidSelectorName < Html2rss::Error; end
|
data/lib/html2rss/config.rb
CHANGED
@@ -27,6 +27,7 @@ module Html2rss
|
|
27
27
|
def_delegator :@channel, :url, :link
|
28
28
|
def_delegator :@channel, :time_zone
|
29
29
|
def_delegator :@channel, :json?
|
30
|
+
def_delegator :@channel, :strategy
|
30
31
|
|
31
32
|
def_delegator :@selectors, :item_selector_names
|
32
33
|
def_delegator :@selectors, :selector?
|
data/lib/html2rss/item.rb
CHANGED
@@ -23,7 +23,9 @@ module Html2rss
|
|
23
23
|
# @param config [Html2rss::Config] Configuration object.
|
24
24
|
# @return [Array<Html2rss::Item>] list of items fetched.
|
25
25
|
def self.from_url(url, config)
|
26
|
-
|
26
|
+
ctx = RequestService::Context.new(url:, headers: config.headers)
|
27
|
+
|
28
|
+
body = RequestService.execute(ctx, strategy: config.strategy).body
|
27
29
|
body = ObjectToXmlConverter.new(JSON.parse(body)).call if config.json?
|
28
30
|
|
29
31
|
Nokogiri.HTML(body)
|
@@ -136,8 +138,11 @@ module Html2rss
|
|
136
138
|
|
137
139
|
raise 'An item.enclosure requires an absolute URL' unless url&.absolute?
|
138
140
|
|
141
|
+
type = config.selector_attributes_with_channel(:enclosure)[:content_type] ||
|
142
|
+
Html2rss::Utils.guess_content_type_from_url(url)
|
143
|
+
|
139
144
|
Enclosure.new(
|
140
|
-
type
|
145
|
+
type:,
|
141
146
|
bits_length: 0,
|
142
147
|
url: url.to_s
|
143
148
|
)
|
@@ -0,0 +1,53 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'puppeteer'
|
4
|
+
|
5
|
+
module Html2rss
|
6
|
+
class RequestService
|
7
|
+
##
|
8
|
+
# Browserless.io strategy to request websites.
|
9
|
+
#
|
10
|
+
# Provide the WebSocket URL and your API token via environment variables:
|
11
|
+
# - BROWSERLESS_IO_WEBSOCKET_URL
|
12
|
+
# - BROWSERLESS_IO_API_TOKEN
|
13
|
+
#
|
14
|
+
# To use this strategy, you need to have a Browserless.io account or run a
|
15
|
+
# local Browserless.io instance.
|
16
|
+
#
|
17
|
+
# @see https://www.browserless.io/
|
18
|
+
#
|
19
|
+
# To run a local Browserless.io instance, you can use the following Docker command:
|
20
|
+
#
|
21
|
+
# ```sh
|
22
|
+
# docker run \
|
23
|
+
# --rm \
|
24
|
+
# -p 3000:3000 \
|
25
|
+
# -e "CONCURRENT=10" \
|
26
|
+
# -e "TOKEN=6R0W53R135510" \
|
27
|
+
# ghcr.io/browserless/chromium
|
28
|
+
# ```
|
29
|
+
#
|
30
|
+
# When running locally, you can skip setting the environment variables, as the above command
|
31
|
+
# is aligned with the default values.
|
32
|
+
# @see https://github.com/browserless/browserless/pkgs/container/chromium
|
33
|
+
class BrowserlessStrategy < Strategy
|
34
|
+
# @return [Response]
|
35
|
+
def execute
|
36
|
+
Puppeteer.connect(browser_ws_endpoint:) do |browser|
|
37
|
+
PuppetCommander.new(ctx, browser).call
|
38
|
+
ensure
|
39
|
+
browser.disconnect
|
40
|
+
end
|
41
|
+
end
|
42
|
+
|
43
|
+
def browser_ws_endpoint
|
44
|
+
@browser_ws_endpoint ||= begin
|
45
|
+
api_token = ENV.fetch('BROWSERLESS_IO_API_TOKEN', '6R0W53R135510')
|
46
|
+
ws_url = ENV.fetch('BROWSERLESS_IO_WEBSOCKET_URL', 'ws://127.0.0.1:3000')
|
47
|
+
|
48
|
+
"#{ws_url}?token=#{api_token}"
|
49
|
+
end
|
50
|
+
end
|
51
|
+
end
|
52
|
+
end
|
53
|
+
end
|
@@ -0,0 +1,46 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'addressable/uri'
|
4
|
+
|
5
|
+
module Html2rss
|
6
|
+
class RequestService
|
7
|
+
##
|
8
|
+
# Holds information needed to send requests to websites.
|
9
|
+
# To be passed down to the RequestService's strategies.
|
10
|
+
class Context
|
11
|
+
SUPPORTED_URL_SCHEMES = %w[http https].to_set.freeze
|
12
|
+
|
13
|
+
##
|
14
|
+
# @param url [String, Addressable::URI] the URL to request
|
15
|
+
# @param headers [Hash] HTTP request headers
|
16
|
+
def initialize(url:, headers: {})
|
17
|
+
@url = Addressable::URI.parse(url)
|
18
|
+
assert_valid_url!
|
19
|
+
|
20
|
+
@headers = headers
|
21
|
+
end
|
22
|
+
|
23
|
+
# @return [Addressable::URI] the parsed URL
|
24
|
+
attr_reader :url
|
25
|
+
|
26
|
+
# @return [Hash] the HTTP request headers
|
27
|
+
attr_reader :headers
|
28
|
+
|
29
|
+
private
|
30
|
+
|
31
|
+
##
|
32
|
+
# Validates the URL.
|
33
|
+
# @raise [InvalidUrl] if the URL is not valid
|
34
|
+
# @raise [UnsupportedUrlScheme] if the URL scheme is not supported
|
35
|
+
def assert_valid_url!
|
36
|
+
raise InvalidUrl, 'URL must be absolute' unless url.absolute?
|
37
|
+
raise InvalidUrl, 'URL must not contain an @ character' if url.to_s.include?('@')
|
38
|
+
|
39
|
+
return if SUPPORTED_URL_SCHEMES.include?(url.scheme)
|
40
|
+
|
41
|
+
raise UnsupportedUrlScheme,
|
42
|
+
"URL scheme '#{url.scheme}' is not supported"
|
43
|
+
end
|
44
|
+
end
|
45
|
+
end
|
46
|
+
end
|
@@ -0,0 +1,24 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'faraday'
|
4
|
+
require 'faraday/follow_redirects'
|
5
|
+
|
6
|
+
module Html2rss
|
7
|
+
class RequestService
|
8
|
+
##
|
9
|
+
# Strategy to use Faraday for the request.
|
10
|
+
# @see https://rubygems.org/gems/faraday
|
11
|
+
class FaradayStrategy < Strategy
|
12
|
+
# @return [Response]
|
13
|
+
def execute
|
14
|
+
request = Faraday.new(url: ctx.url, headers: ctx.headers) do |faraday|
|
15
|
+
faraday.use Faraday::FollowRedirects::Middleware
|
16
|
+
faraday.adapter Faraday.default_adapter
|
17
|
+
end
|
18
|
+
response = request.get
|
19
|
+
|
20
|
+
Response.new(body: response.body, headers: response.headers)
|
21
|
+
end
|
22
|
+
end
|
23
|
+
end
|
24
|
+
end
|
@@ -0,0 +1,61 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Html2rss
|
4
|
+
class RequestService
|
5
|
+
##
|
6
|
+
# Commands the Puppeteer browser to navigate to the website and builds the Response.
|
7
|
+
class PuppetCommander
|
8
|
+
# @param ctx [Context]
|
9
|
+
# @param browser [Puppeteer::Browser]
|
10
|
+
# @param skip_request_resources [Set<String>] the resource types not to request
|
11
|
+
# @param referer [String] the referer to use for the request
|
12
|
+
def initialize(ctx,
|
13
|
+
browser,
|
14
|
+
skip_request_resources: %w[stylesheet image media font].to_set,
|
15
|
+
referer: [ctx.url.scheme, ctx.url.host].join('://'))
|
16
|
+
@ctx = ctx
|
17
|
+
@browser = browser
|
18
|
+
@skip_request_resources = skip_request_resources
|
19
|
+
@referer = referer
|
20
|
+
end
|
21
|
+
|
22
|
+
# @return [Response]
|
23
|
+
def call
|
24
|
+
page = new_page
|
25
|
+
|
26
|
+
response = navigate_to_destination(page, ctx.url)
|
27
|
+
|
28
|
+
Response.new(body: body(page), headers: response.headers)
|
29
|
+
ensure
|
30
|
+
page&.close
|
31
|
+
end
|
32
|
+
|
33
|
+
##
|
34
|
+
# @return [Puppeteer::Page]
|
35
|
+
# @see https://yusukeiwaki.github.io/puppeteer-ruby-docs/Puppeteer/Page.html
|
36
|
+
def new_page
|
37
|
+
page = browser.new_page
|
38
|
+
page.extra_http_headers = ctx.headers
|
39
|
+
|
40
|
+
return page if skip_request_resources.empty?
|
41
|
+
|
42
|
+
page.request_interception = true
|
43
|
+
page.on('request') do |request|
|
44
|
+
skip_request_resources.member?(request.resource_type) ? request.abort : request.continue
|
45
|
+
end
|
46
|
+
|
47
|
+
page
|
48
|
+
end
|
49
|
+
|
50
|
+
def navigate_to_destination(page, url)
|
51
|
+
page.goto(url, wait_until: 'networkidle0', referer:)
|
52
|
+
end
|
53
|
+
|
54
|
+
def body(page) = page.content
|
55
|
+
|
56
|
+
private
|
57
|
+
|
58
|
+
attr_reader :ctx, :browser, :skip_request_resources, :referer
|
59
|
+
end
|
60
|
+
end
|
61
|
+
end
|
@@ -0,0 +1,27 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Html2rss
|
4
|
+
class RequestService
|
5
|
+
##
|
6
|
+
# To be used by strategies to provide their response.
|
7
|
+
class Response
|
8
|
+
##
|
9
|
+
# @param body [String] the body of the response
|
10
|
+
# @param headers [Hash] the headers of the response
|
11
|
+
def initialize(body:, headers: {})
|
12
|
+
@body = body
|
13
|
+
|
14
|
+
headers = headers.dup
|
15
|
+
headers.transform_keys!(&:to_s)
|
16
|
+
|
17
|
+
@headers = headers
|
18
|
+
end
|
19
|
+
|
20
|
+
# @return [String] the body of the response
|
21
|
+
attr_reader :body
|
22
|
+
|
23
|
+
# @return [Hash<String, Object>] the headers of the response
|
24
|
+
attr_reader :headers
|
25
|
+
end
|
26
|
+
end
|
27
|
+
end
|