html2rss 0.15.0 → 0.17.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +112 -44
- data/html2rss.gemspec +3 -2
- data/lib/html2rss/attribute_post_processors/sanitize_html.rb +8 -1
- data/lib/html2rss/auto_source/article.rb +37 -5
- data/lib/html2rss/auto_source/channel.rb +21 -28
- data/lib/html2rss/auto_source/cleanup.rb +0 -16
- data/lib/html2rss/auto_source/rss_builder.rb +1 -1
- data/lib/html2rss/auto_source/scraper/html.rb +21 -12
- data/lib/html2rss/auto_source/scraper/schema/item_list.rb +34 -0
- data/lib/html2rss/auto_source/scraper/schema/list_item.rb +25 -0
- data/lib/html2rss/auto_source/scraper/schema/thing.rb +104 -0
- data/lib/html2rss/auto_source/scraper/schema.rb +22 -34
- data/lib/html2rss/auto_source/scraper/semantic_html/extractor.rb +41 -41
- data/lib/html2rss/auto_source/scraper/semantic_html/image.rb +6 -6
- data/lib/html2rss/auto_source/scraper/semantic_html.rb +3 -2
- data/lib/html2rss/auto_source.rb +0 -7
- data/lib/html2rss/cli.rb +11 -4
- data/lib/html2rss/config/channel.rb +7 -1
- data/lib/html2rss/config/selectors.rb +2 -1
- data/lib/html2rss/config.rb +1 -0
- data/lib/html2rss/item.rb +7 -2
- data/lib/html2rss/request_service/browserless_strategy.rb +53 -0
- data/lib/html2rss/request_service/context.rb +46 -0
- data/lib/html2rss/request_service/faraday_strategy.rb +24 -0
- data/lib/html2rss/request_service/puppet_commander.rb +61 -0
- data/lib/html2rss/request_service/response.rb +27 -0
- data/lib/html2rss/request_service/strategy.rb +28 -0
- data/lib/html2rss/request_service.rb +97 -0
- data/lib/html2rss/rss_builder/stylesheet.rb +7 -0
- data/lib/html2rss/utils.rb +23 -26
- data/lib/html2rss/version.rb +1 -1
- data/lib/html2rss.rb +5 -5
- metadata +31 -11
- data/lib/html2rss/auto_source/scraper/schema/base.rb +0 -61
@@ -0,0 +1,104 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'date'
|
4
|
+
|
5
|
+
module Html2rss
|
6
|
+
class AutoSource
|
7
|
+
module Scraper
|
8
|
+
class Schema
|
9
|
+
##
|
10
|
+
# A Thing is kind of the 'base class' for Schema.org schema_objects.
|
11
|
+
#
|
12
|
+
# @see https://schema.org/Thing
|
13
|
+
class Thing
  # Schema.org `@type` values this scraper accepts.
  SUPPORTED_TYPES = %w[
    AdvertiserContentArticle
    AnalysisNewsArticle
    APIReference
    Article
    AskPublicNewsArticle
    BackgroundNewsArticle
    BlogPosting
    DiscussionForumPosting
    LiveBlogPosting
    NewsArticle
    OpinionNewsArticle
    Report
    ReportageNewsArticle
    ReviewNewsArticle
    SatiricalArticle
    ScholarlyArticle
    SocialMediaPosting
    TechArticle
  ].to_set.freeze

  # Public reader methods that `#call` collects into the article hash.
  DEFAULT_ATTRIBUTES = %i[id title description url image published_at].freeze

  # @param schema_object [Hash] the schema object; it is read with Symbol keys (e.g. `:@id`, `:url`)
  # @param url [Addressable::URI, String] base URL handed to
  #   `Utils.build_absolute_url_from_relative` for `url` and `image`
  def initialize(schema_object, url:)
    @schema_object = schema_object
    @url = url
  end

  # @return [Hash] the scraped article hash with DEFAULT_ATTRIBUTES
  def call
    DEFAULT_ATTRIBUTES.to_h do |attribute|
      [attribute, public_send(attribute)]
    end
  end

  # @return [String, nil] the object's `@id`, falling back to the path of its URL; nil when both are blank
  def id
    return @id if defined?(@id)

    id = (schema_object[:@id] || url&.path).to_s

    # Memoize the nil result as well, so the lookup (and its debug log) runs only once.
    @id = id.empty? ? nil : id
  end

  # `:title` keeps precedence for backward compatibility; `headline` and `name`
  # are the properties Schema.org itself defines for articles and things.
  #
  # @return [String, nil]
  def title = schema_object[:title] || schema_object[:headline] || schema_object[:name]

  # Picks the longest of the available textual properties.
  # `:schema_object_body` is kept for backward compatibility; `articleBody` is
  # the Schema.org property name.
  #
  # @return [String, nil]
  def description
    schema_object.values_at(:description, :articleBody, :schema_object_body, :abstract)
                 .max_by { |string| string.to_s.size }
  end

  # @return [Addressable::URI, nil] the URL of the schema object
  def url
    relative_url = schema_object[:url]
    if relative_url.to_s.empty?
      Log.debug("Schema#Thing.url: no url in schema_object: #{schema_object.inspect}")
      return
    end

    Utils.build_absolute_url_from_relative(relative_url, @url)
  end

  # @return [Addressable::URI, nil] the first usable image URL, made absolute
  def image
    image_url = image_urls.first
    return unless image_url

    Utils.build_absolute_url_from_relative(image_url, @url)
  end

  # @return [Object, nil] the raw `datePublished` value; it is not parsed here
  def published_at = schema_object[:datePublished]

  private

  attr_reader :schema_object

  # Collects candidate image URLs from `image` and `thumbnailUrl`. Each property
  # may hold a single value or (as JSON-LD allows) an array of values.
  #
  # @return [Array<String>]
  def image_urls
    schema_object.values_at(:image, :thumbnailUrl)
                 .flat_map { |value| value.is_a?(Array) ? value : [value] }
                 .filter_map { |object| image_url_from(object) }
  end

  # @param object [String, Hash, nil] a single `image`/`thumbnailUrl` value
  # @return [String, nil] the URL it carries, or nil when it has none
  def image_url_from(object)
    case object
    when String
      # An empty string would otherwise resolve to the base URL itself.
      object unless object.empty?
    when Hash
      object[:url] || object[:contentUrl] if object[:@type] == 'ImageObject'
    end
  end
end
|
101
|
+
end
|
102
|
+
end
|
103
|
+
end
|
104
|
+
end
|
@@ -2,58 +2,38 @@
|
|
2
2
|
|
3
3
|
require 'json'
|
4
4
|
require 'nokogiri'
|
5
|
-
require 'set'
|
6
5
|
|
7
6
|
module Html2rss
|
8
7
|
class AutoSource
|
9
8
|
module Scraper
|
10
9
|
##
|
11
|
-
#
|
10
|
+
# Scrapes articles from Schema.org objects, by looking for the objects in:
|
12
11
|
|
13
|
-
#
|
14
|
-
# 2. tbd
|
12
|
+
# <script type="application/ld+json"> "schema" tags.
|
15
13
|
#
|
16
14
|
# See:
|
17
|
-
# 1. https://schema.org/
|
15
|
+
# 1. https://schema.org/docs/full.html
|
18
16
|
# 2. https://developers.google.com/search/docs/appearance/structured-data/article#microdata
|
19
17
|
class Schema
|
20
18
|
include Enumerable
|
21
19
|
|
22
20
|
TAG_SELECTOR = 'script[type="application/ld+json"]'
|
23
|
-
SCHEMA_OBJECT_TYPES = %w[
|
24
|
-
AdvertiserContentArticle
|
25
|
-
AnalysisNewsArticle
|
26
|
-
APIReference
|
27
|
-
Article
|
28
|
-
AskPublicNewsArticle
|
29
|
-
BackgroundNewsArticle
|
30
|
-
BlogPosting
|
31
|
-
DiscussionForumPosting
|
32
|
-
LiveBlogPosting
|
33
|
-
NewsArticle
|
34
|
-
OpinionNewsArticle
|
35
|
-
Report
|
36
|
-
ReportageNewsArticle
|
37
|
-
ReviewNewsArticle
|
38
|
-
SatiricalArticle
|
39
|
-
ScholarlyArticle
|
40
|
-
SocialMediaPosting
|
41
|
-
TechArticle
|
42
|
-
].to_set.freeze
|
43
21
|
|
44
22
|
class << self
|
45
23
|
def articles?(parsed_body)
|
46
24
|
parsed_body.css(TAG_SELECTOR).any? do |script|
|
47
|
-
|
25
|
+
(Thing::SUPPORTED_TYPES | ItemList::SUPPORTED_TYPES).any? do |type|
|
26
|
+
script.text.match?(/"@type"\s*:\s*"#{Regexp.escape(type)}"/)
|
27
|
+
end
|
48
28
|
end
|
49
29
|
end
|
50
30
|
|
51
31
|
##
|
52
32
|
# Returns a flat array
|
53
33
|
# of all supported schema objects
|
54
|
-
# by recursively traversing the `
|
34
|
+
# by recursively traversing the given `object`.
|
55
35
|
#
|
56
|
-
# @param object [Hash, Array]
|
36
|
+
# @param object [Hash, Array, Nokogiri::XML::Element]
|
57
37
|
# @return [Array<Hash>] the schema_objects, or an empty array
|
58
38
|
# :reek:DuplicateMethodCall
|
59
39
|
def from(object)
|
@@ -74,12 +54,16 @@ module Html2rss
|
|
74
54
|
end
|
75
55
|
|
76
56
|
##
|
77
|
-
# @return [Scraper::Schema::
|
57
|
+
# @return [Scraper::Schema::Thing, Scraper::Schema::ItemList, nil] a class responding to `#call`
|
78
58
|
def scraper_for_schema_object(schema_object)
|
79
|
-
|
80
|
-
|
59
|
+
type = schema_object[:@type]
|
60
|
+
|
61
|
+
if Thing::SUPPORTED_TYPES.member?(type)
|
62
|
+
Thing
|
63
|
+
elsif ItemList::SUPPORTED_TYPES.member?(type)
|
64
|
+
ItemList
|
81
65
|
else
|
82
|
-
Log.warn("Schema#scraper_for_schema_object: Unsupported schema object @type: #{
|
66
|
+
Log.warn("Schema#scraper_for_schema_object: Unsupported schema object @type: #{type}")
|
83
67
|
nil
|
84
68
|
end
|
85
69
|
end
|
@@ -107,9 +91,13 @@ module Html2rss
|
|
107
91
|
|
108
92
|
schema_objects.filter_map do |schema_object|
|
109
93
|
next unless (klass = self.class.scraper_for_schema_object(schema_object))
|
110
|
-
next unless (
|
94
|
+
next unless (results = klass.new(schema_object, url:).call)
|
111
95
|
|
112
|
-
|
96
|
+
if results.is_a?(Array)
|
97
|
+
results.each { |result| yield(result) } # rubocop:disable Style/ExplicitBlockArgument
|
98
|
+
else
|
99
|
+
yield(results)
|
100
|
+
end
|
113
101
|
end
|
114
102
|
end
|
115
103
|
|
@@ -1,7 +1,5 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
|
-
require 'set'
|
4
|
-
|
5
3
|
module Html2rss
|
6
4
|
class AutoSource
|
7
5
|
module Scraper
|
@@ -33,6 +31,8 @@ module Html2rss
|
|
33
31
|
end
|
34
32
|
|
35
33
|
def initialize(article_tag, url:)
|
34
|
+
raise ArgumentError, 'article_tag is required' unless article_tag
|
35
|
+
|
36
36
|
@article_tag = article_tag
|
37
37
|
@url = url
|
38
38
|
end
|
@@ -57,20 +57,6 @@ module Html2rss
|
|
57
57
|
|
58
58
|
attr_reader :article_tag, :url, :heading, :extract_url
|
59
59
|
|
60
|
-
def visible_text_from_tag(tag, separator: ' ') = self.class.visible_text_from_tag(tag, separator:)
|
61
|
-
|
62
|
-
# @see https://developer.mozilla.org/en-US/docs/Web/API/HTMLTimeElement/dateTime
|
63
|
-
def extract_published_at
|
64
|
-
times = article_tag.css('time[datetime]')
|
65
|
-
.filter_map do |tag|
|
66
|
-
DateTime.parse(tag['datetime'])
|
67
|
-
rescue ArgumentError, TypeError
|
68
|
-
nil
|
69
|
-
end
|
70
|
-
|
71
|
-
times.min
|
72
|
-
end
|
73
|
-
|
74
60
|
##
|
75
61
|
# Find the heading of the article.
|
76
62
|
# @return [Nokogiri::XML::Node, nil]
|
@@ -80,18 +66,36 @@ module Html2rss
|
|
80
66
|
return if heading_tags.empty?
|
81
67
|
|
82
68
|
smallest_heading = heading_tags.keys.min
|
83
|
-
heading_tags[smallest_heading]&.max_by { |tag| visible_text_from_tag(tag)&.size }
|
69
|
+
heading_tags[smallest_heading]&.max_by { |tag| visible_text_from_tag(tag)&.size.to_i }
|
70
|
+
end
|
71
|
+
|
72
|
+
def visible_text_from_tag(tag, separator: ' ') = self.class.visible_text_from_tag(tag, separator:)
|
73
|
+
|
74
|
+
def closest_anchor
|
75
|
+
SemanticHtml.find_closest_selector(heading || article_tag,
|
76
|
+
selector: 'a[href]:not([href=""])')
|
77
|
+
end
|
78
|
+
|
79
|
+
def find_url
|
80
|
+
href = closest_anchor&.[]('href')
|
81
|
+
|
82
|
+
return if (parts = href.to_s.split('#')).empty?
|
83
|
+
|
84
|
+
Utils.build_absolute_url_from_relative(parts.first.strip, url)
|
84
85
|
end
|
85
86
|
|
86
87
|
def extract_title
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
88
|
+
if heading && (heading.children.empty? || heading.text)
|
89
|
+
visible_text_from_tag(heading)
|
90
|
+
else
|
91
|
+
visible_text_from_tag(article_tag.css(HEADING_TAGS.join(','))
|
92
|
+
.max_by { |tag| tag.text.size })
|
93
|
+
|
94
|
+
end
|
95
|
+
end
|
96
|
+
|
97
|
+
def extract_image
|
98
|
+
Image.call(article_tag, url:)
|
95
99
|
end
|
96
100
|
|
97
101
|
def extract_description
|
@@ -101,26 +105,10 @@ module Html2rss
|
|
101
105
|
description = visible_text_from_tag(article_tag)
|
102
106
|
return nil unless description
|
103
107
|
|
104
|
-
title_text = extract_title
|
105
|
-
description.gsub!(title_text, '') if title_text
|
106
108
|
description.strip!
|
107
109
|
description.empty? ? nil : description
|
108
110
|
end
|
109
111
|
|
110
|
-
def closest_anchor
|
111
|
-
SemanticHtml.find_closest_selector(heading || article_tag,
|
112
|
-
selector: 'a[href]:not([href=""])')
|
113
|
-
end
|
114
|
-
|
115
|
-
def find_url
|
116
|
-
href = closest_anchor&.[]('href')&.split('#')&.first&.strip
|
117
|
-
Utils.build_absolute_url_from_relative(href, url) unless href.to_s.empty?
|
118
|
-
end
|
119
|
-
|
120
|
-
def extract_image
|
121
|
-
Image.call(article_tag, url:)
|
122
|
-
end
|
123
|
-
|
124
112
|
def generate_id
|
125
113
|
[
|
126
114
|
article_tag['id'],
|
@@ -129,6 +117,18 @@ module Html2rss
|
|
129
117
|
extract_url&.query
|
130
118
|
].compact.reject(&:empty?).first
|
131
119
|
end
|
120
|
+
|
121
|
+
# @see https://developer.mozilla.org/en-US/docs/Web/API/HTMLTimeElement/dateTime
|
122
|
+
def extract_published_at
|
123
|
+
times = article_tag.css('time[datetime]')
|
124
|
+
.filter_map do |tag|
|
125
|
+
DateTime.parse(tag['datetime'])
|
126
|
+
rescue ArgumentError, TypeError
|
127
|
+
nil
|
128
|
+
end
|
129
|
+
|
130
|
+
times.min
|
131
|
+
end
|
132
132
|
end
|
133
133
|
end
|
134
134
|
end
|
@@ -28,14 +28,14 @@ module Html2rss
|
|
28
28
|
# @see <https://developer.mozilla.org/en-US/docs/Web/HTML/Element/picture>
|
29
29
|
def self.from_source(article_tag) # rubocop:disable Metrics/AbcSize
|
30
30
|
hash = article_tag.css('img[srcset], picture > source[srcset]')
|
31
|
-
.flat_map
|
32
|
-
|
33
|
-
|
34
|
-
next if url.nil? || url.start_with?('data:')
|
31
|
+
.flat_map do |source|
|
32
|
+
source['srcset'].to_s.scan(/(\S+)\s+(\d+w|\d+h)/).map do |url, width|
|
33
|
+
next if url.nil? || url.start_with?('data:')
|
35
34
|
|
36
|
-
|
35
|
+
width_value = width.to_i.zero? ? 0 : width.scan(/\d+/).first.to_i
|
37
36
|
|
38
|
-
|
37
|
+
[width_value, url.strip]
|
38
|
+
end
|
39
39
|
end.to_h
|
40
40
|
|
41
41
|
hash[hash.keys.max]
|
@@ -106,9 +106,10 @@ module Html2rss
|
|
106
106
|
SemanticHtml.anchor_tag_selector_pairs.each do |tag_name, selector|
|
107
107
|
parsed_body.css(selector).each do |selected_tag|
|
108
108
|
article_tag = SemanticHtml.find_tag_in_ancestors(selected_tag, tag_name)
|
109
|
-
article_hash = Extractor.new(article_tag, url: @url).call
|
110
109
|
|
111
|
-
|
110
|
+
if article_tag && (article_hash = Extractor.new(article_tag, url: @url).call)
|
111
|
+
yield article_hash
|
112
|
+
end
|
112
113
|
end
|
113
114
|
end
|
114
115
|
end
|
data/lib/html2rss/auto_source.rb
CHANGED
@@ -11,20 +11,13 @@ module Html2rss
|
|
11
11
|
# It uses a set of ArticleExtractors to extract articles, utilizing popular ways of
|
12
12
|
# marking articles, e.g. schema, microdata, open graph, etc.
|
13
13
|
class AutoSource
|
14
|
-
class UnsupportedUrlScheme < Html2rss::Error; end
|
15
14
|
class NoArticlesFound < Html2rss::Error; end
|
16
15
|
|
17
|
-
SUPPORTED_URL_SCHEMES = %w[http https].to_set.freeze
|
18
|
-
|
19
16
|
##
|
20
17
|
# @param url [Addressable::URI] The URL to extract articles from.
|
21
18
|
# @param body [String] The body of the response.
|
22
19
|
# @param headers [Hash] The headers of the response.
|
23
20
|
def initialize(url, body:, headers: {})
|
24
|
-
raise ArgumentError, 'URL must be a Addressable::URI' unless url.is_a?(Addressable::URI)
|
25
|
-
raise ArgumentError, 'URL must be absolute' unless url.absolute?
|
26
|
-
raise UnsupportedUrlScheme, "#{url.scheme} not supported" unless SUPPORTED_URL_SCHEMES.include?(url.scheme)
|
27
|
-
|
28
21
|
@url = url
|
29
22
|
@body = body
|
30
23
|
@headers = headers
|
data/lib/html2rss/cli.rb
CHANGED
@@ -2,7 +2,6 @@
|
|
2
2
|
|
3
3
|
require_relative '../html2rss'
|
4
4
|
require 'thor'
|
5
|
-
require 'addressable'
|
6
5
|
|
7
6
|
##
|
8
7
|
# The Html2rss namespace / command line interface.
|
@@ -26,14 +25,22 @@ module Html2rss
|
|
26
25
|
def feed(yaml_file, *options)
|
27
26
|
raise "File '#{yaml_file}' does not exist" unless File.exist?(yaml_file)
|
28
27
|
|
29
|
-
feed_name = options.shift
|
28
|
+
feed_name = options.shift unless options.first&.include?('=')
|
30
29
|
params = options.to_h { |opt| opt.split('=', 2) }
|
30
|
+
|
31
31
|
puts Html2rss.feed_from_yaml_config(yaml_file, feed_name, params:)
|
32
32
|
end
|
33
33
|
|
34
|
-
desc 'auto URL', '
|
34
|
+
desc 'auto URL', 'Automatically sources an RSS feed from the URL'
|
35
|
+
method_option :strategy,
|
36
|
+
type: :string,
|
37
|
+
desc: 'The strategy to request the URL',
|
38
|
+
enum: RequestService.strategy_names,
|
39
|
+
default: RequestService.default_strategy_name
|
35
40
|
def auto(url)
|
36
|
-
|
41
|
+
strategy = options.fetch(:strategy) { RequestService.default_strategy_name }.to_sym
|
42
|
+
|
43
|
+
puts Html2rss.auto_source(url, strategy:)
|
37
44
|
end
|
38
45
|
end
|
39
46
|
end
|
@@ -55,7 +55,7 @@ module Html2rss
|
|
55
55
|
##
|
56
56
|
# @return [String]
|
57
57
|
def title
|
58
|
-
config.fetch(:title) { Utils.
|
58
|
+
config.fetch(:title) { Utils.titleized_channel_url(url) }
|
59
59
|
end
|
60
60
|
|
61
61
|
##
|
@@ -88,6 +88,12 @@ module Html2rss
|
|
88
88
|
config.fetch(:json, false)
|
89
89
|
end
|
90
90
|
|
91
|
+
##
|
92
|
+
# @return [Symbol]
|
93
|
+
def strategy
|
94
|
+
config.fetch(:strategy) { RequestService.default_strategy_name }.to_sym
|
95
|
+
end
|
96
|
+
|
91
97
|
private
|
92
98
|
|
93
99
|
# @return [Hash<Symbol, Object>]
|
@@ -8,7 +8,8 @@ module Html2rss
|
|
8
8
|
ITEMS_SELECTOR_NAME = :items
|
9
9
|
|
10
10
|
# Struct to represent a selector with associated attributes for extraction and processing.
|
11
|
-
Selector = Struct.new(:selector, :attribute, :extractor, :post_process, :order, :static,
|
11
|
+
Selector = Struct.new(:selector, :attribute, :extractor, :post_process, :order, :static, :content_type,
|
12
|
+
keyword_init: true)
|
12
13
|
|
13
14
|
# raised when an invalid selector name is used
|
14
15
|
class InvalidSelectorName < Html2rss::Error; end
|
data/lib/html2rss/config.rb
CHANGED
@@ -27,6 +27,7 @@ module Html2rss
|
|
27
27
|
def_delegator :@channel, :url, :link
|
28
28
|
def_delegator :@channel, :time_zone
|
29
29
|
def_delegator :@channel, :json?
|
30
|
+
def_delegator :@channel, :strategy
|
30
31
|
|
31
32
|
def_delegator :@selectors, :item_selector_names
|
32
33
|
def_delegator :@selectors, :selector?
|
data/lib/html2rss/item.rb
CHANGED
@@ -23,7 +23,9 @@ module Html2rss
|
|
23
23
|
# @param config [Html2rss::Config] Configuration object.
|
24
24
|
# @return [Array<Html2rss::Item>] list of items fetched.
|
25
25
|
def self.from_url(url, config)
|
26
|
-
|
26
|
+
ctx = RequestService::Context.new(url:, headers: config.headers)
|
27
|
+
|
28
|
+
body = RequestService.execute(ctx, strategy: config.strategy).body
|
27
29
|
body = ObjectToXmlConverter.new(JSON.parse(body)).call if config.json?
|
28
30
|
|
29
31
|
Nokogiri.HTML(body)
|
@@ -136,8 +138,11 @@ module Html2rss
|
|
136
138
|
|
137
139
|
raise 'An item.enclosure requires an absolute URL' unless url&.absolute?
|
138
140
|
|
141
|
+
type = config.selector_attributes_with_channel(:enclosure)[:content_type] ||
|
142
|
+
Html2rss::Utils.guess_content_type_from_url(url)
|
143
|
+
|
139
144
|
Enclosure.new(
|
140
|
-
type
|
145
|
+
type:,
|
141
146
|
bits_length: 0,
|
142
147
|
url: url.to_s
|
143
148
|
)
|
@@ -0,0 +1,53 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'puppeteer'
|
4
|
+
|
5
|
+
module Html2rss
|
6
|
+
class RequestService
|
7
|
+
##
|
8
|
+
# Browserless.io strategy to request websites.
|
9
|
+
#
|
10
|
+
# Provide the WebSocket URL and your API token via environment variables:
|
11
|
+
# - BROWSERLESS_IO_WEBSOCKET_URL
|
12
|
+
# - BROWSERLESS_IO_API_TOKEN
|
13
|
+
#
|
14
|
+
# To use this strategy, you need to have a Browserless.io account or run a
|
15
|
+
# local Browserless.io instance.
|
16
|
+
#
|
17
|
+
# @see https://www.browserless.io/
|
18
|
+
#
|
19
|
+
# To run a local Browserless.io instance, you can use the following Docker command:
|
20
|
+
#
|
21
|
+
# ```sh
|
22
|
+
# docker run \
|
23
|
+
# --rm \
|
24
|
+
# -p 3000:3000 \
|
25
|
+
# -e "CONCURRENT=10" \
|
26
|
+
# -e "TOKEN=6R0W53R135510" \
|
27
|
+
# ghcr.io/browserless/chromium
|
28
|
+
# ```
|
29
|
+
#
|
30
|
+
# When running locally, you can skip setting the environment variables, as the command above
|
31
|
+
# matches the default values.
|
32
|
+
# @see https://github.com/browserless/browserless/pkgs/container/chromium
|
33
|
+
class BrowserlessStrategy < Strategy
  # Connects to the browser at `browser_ws_endpoint` and lets a PuppetCommander
  # perform the request. The browser connection is dropped once the block ends.
  #
  # @return [Response]
  def execute
    Puppeteer.connect(browser_ws_endpoint:) do |browser|
      PuppetCommander.new(ctx, browser).call
    ensure
      browser.disconnect
    end
  end

  # Builds (and memoizes) the WebSocket endpoint from the environment variables
  # BROWSERLESS_IO_WEBSOCKET_URL and BROWSERLESS_IO_API_TOKEN, falling back to
  # the defaults below.
  #
  # @return [String] the WebSocket URL carrying the API token as `token` query parameter
  def browser_ws_endpoint
    @browser_ws_endpoint ||= begin
      api_token = ENV.fetch('BROWSERLESS_IO_API_TOKEN', '6R0W53R135510')
      ws_url = ENV.fetch('BROWSERLESS_IO_WEBSOCKET_URL', 'ws://127.0.0.1:3000')

      # A configured URL may already carry a query string; append instead of
      # starting a second one.
      separator = ws_url.include?('?') ? '&' : '?'

      "#{ws_url}#{separator}token=#{api_token}"
    end
  end
end
|
52
|
+
end
|
53
|
+
end
|
@@ -0,0 +1,46 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'addressable/uri'
|
4
|
+
|
5
|
+
module Html2rss
|
6
|
+
class RequestService
|
7
|
+
##
|
8
|
+
# Holds information needed to send requests to websites.
|
9
|
+
# To be passed down to the RequestService's strategies.
|
10
|
+
class Context
  SUPPORTED_URL_SCHEMES = %w[http https].to_set.freeze

  ##
  # @param url [String, Addressable::URI] the URL to request
  # @param headers [Hash] HTTP request headers
  # @raise [InvalidUrl] if the URL is missing, not absolute, or contains an @ character
  # @raise [UnsupportedUrlScheme] if the URL scheme is not supported
  def initialize(url:, headers: {})
    @url = Addressable::URI.parse(url)
    assert_valid_url!

    @headers = headers
  end

  # @return [Addressable::URI] the parsed URL
  attr_reader :url

  # @return [Hash] the HTTP request headers
  attr_reader :headers

  private

  ##
  # Validates the URL.
  # @raise [InvalidUrl] if the URL is not valid
  # @raise [UnsupportedUrlScheme] if the URL scheme is not supported
  def assert_valid_url!
    # Addressable::URI.parse returns nil for nil input; report that as an
    # invalid URL instead of failing with NoMethodError.
    raise InvalidUrl, 'URL must be absolute' unless url&.absolute?
    raise InvalidUrl, 'URL must not contain an @ character' if url.to_s.include?('@')

    return if SUPPORTED_URL_SCHEMES.include?(url.scheme)

    raise UnsupportedUrlScheme,
          "URL scheme '#{url.scheme}' is not supported"
  end
end
|
45
|
+
end
|
46
|
+
end
|
@@ -0,0 +1,24 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'faraday'
|
4
|
+
require 'faraday/follow_redirects'
|
5
|
+
|
6
|
+
module Html2rss
|
7
|
+
class RequestService
|
8
|
+
##
|
9
|
+
# Strategy to use Faraday for the request.
|
10
|
+
# @see https://rubygems.org/gems/faraday
|
11
|
+
class FaradayStrategy < Strategy
  # Performs a GET request for `ctx.url` with `ctx.headers`, following
  # redirects, and wraps the reply's body and headers in a Response.
  #
  # @return [Response]
  def execute
    connection = Faraday.new(url: ctx.url, headers: ctx.headers) do |builder|
      builder.use Faraday::FollowRedirects::Middleware
      builder.adapter Faraday.default_adapter
    end

    reply = connection.get
    Response.new(body: reply.body, headers: reply.headers)
  end
end
|
23
|
+
end
|
24
|
+
end
|
@@ -0,0 +1,61 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Html2rss
|
4
|
+
class RequestService
|
5
|
+
##
|
6
|
+
# Commands the Puppeteer Browser to the website and builds the Response.
|
7
|
+
class PuppetCommander
  # @param ctx [Context]
  # @param browser [Puppeteer::Browser]
  # @param skip_request_resources [Set<String>] the resource types not to request
  # @param referer [String] the referer to use for the request
  def initialize(ctx,
                 browser,
                 skip_request_resources: %w[stylesheet image media font].to_set,
                 referer: [ctx.url.scheme, ctx.url.host].join('://'))
    @ctx = ctx
    @browser = browser
    @skip_request_resources = skip_request_resources
    @referer = referer
  end

  # Opens a page, navigates it to `ctx.url` and builds the Response from the
  # page content and the navigation response's headers. The page is always
  # closed afterwards.
  #
  # @return [Response]
  def call
    page = new_page

    response = navigate_to_destination(page, ctx.url)

    Response.new(body: body(page), headers: response.headers)
  ensure
    page&.close
  end

  ##
  # Opens a new page, applies the request headers and, unless
  # `skip_request_resources` is empty, aborts requests for those resource types.
  #
  # @return [Puppeteer::Page]
  # @see https://yusukeiwaki.github.io/puppeteer-ruby-docs/Puppeteer/Page.html
  def new_page
    page = browser.new_page
    page.extra_http_headers = ctx.headers
    intercept_requests(page) unless skip_request_resources.empty?

    page
  rescue StandardError
    # The caller never receives the page when setup fails, so it cannot close
    # it; close it here to avoid leaking an open page in the browser.
    page&.close
    raise
  end

  # @param page [Puppeteer::Page]
  # @param url [Addressable::URI, String] the destination
  # @return [Object] whatever `page.goto` returns; `#call` reads `#headers` from it
  def navigate_to_destination(page, url)
    page.goto(url, wait_until: 'networkidle0', referer:)
  end

  # @param page [Puppeteer::Page]
  # @return [String] the page's content
  def body(page) = page.content

  private

  attr_reader :ctx, :browser, :skip_request_resources, :referer

  # Enables request interception and aborts every request whose resource type
  # is listed in `skip_request_resources`; all other requests continue.
  def intercept_requests(page)
    page.request_interception = true
    page.on('request') do |request|
      skip_request_resources.member?(request.resource_type) ? request.abort : request.continue
    end
  end
end
|
60
|
+
end
|
61
|
+
end
|