html2rss 0.17.0 → 0.19.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +90 -639
- data/exe/html2rss +1 -1
- data/html2rss.gemspec +5 -2
- data/lib/html2rss/articles/deduplicator.rb +50 -0
- data/lib/html2rss/auto_source/cleanup.rb +44 -5
- data/lib/html2rss/auto_source/scraper/html.rb +123 -43
- data/lib/html2rss/auto_source/scraper/json_state.rb +457 -0
- data/lib/html2rss/auto_source/scraper/microdata.rb +505 -0
- data/lib/html2rss/auto_source/scraper/schema/category_extractor.rb +102 -0
- data/lib/html2rss/auto_source/scraper/schema/item_list.rb +3 -2
- data/lib/html2rss/auto_source/scraper/schema/list_item.rb +6 -4
- data/lib/html2rss/auto_source/scraper/schema/thing.rb +69 -8
- data/lib/html2rss/auto_source/scraper/schema.rb +27 -12
- data/lib/html2rss/auto_source/scraper/semantic_html/anchor_selector.rb +204 -0
- data/lib/html2rss/auto_source/scraper/semantic_html.rb +88 -79
- data/lib/html2rss/auto_source/scraper/wordpress_api/page_scope.rb +311 -0
- data/lib/html2rss/auto_source/scraper/wordpress_api/posts_endpoint.rb +135 -0
- data/lib/html2rss/auto_source/scraper/wordpress_api.rb +186 -0
- data/lib/html2rss/auto_source/scraper.rb +160 -8
- data/lib/html2rss/auto_source.rb +123 -47
- data/lib/html2rss/blocked_surface.rb +65 -0
- data/lib/html2rss/category_extractor.rb +82 -0
- data/lib/html2rss/cli.rb +194 -23
- data/lib/html2rss/config/class_methods.rb +178 -0
- data/lib/html2rss/config/dynamic_params.rb +70 -0
- data/lib/html2rss/config/multiple_feeds_config.rb +51 -0
- data/lib/html2rss/config/request_headers.rb +136 -0
- data/lib/html2rss/config/schema.rb +240 -0
- data/lib/html2rss/config/validator.rb +146 -0
- data/lib/html2rss/config.rb +118 -61
- data/lib/html2rss/error.rb +31 -0
- data/lib/html2rss/feed_pipeline/auto_fallback.rb +127 -0
- data/lib/html2rss/feed_pipeline.rb +127 -0
- data/lib/html2rss/hash_util.rb +101 -0
- data/lib/html2rss/html_extractor/date_extractor.rb +20 -0
- data/lib/html2rss/html_extractor/enclosure_extractor.rb +120 -0
- data/lib/html2rss/html_extractor/image_extractor.rb +58 -0
- data/lib/html2rss/html_extractor.rb +141 -0
- data/lib/html2rss/html_navigator.rb +54 -0
- data/lib/html2rss/json_feed_builder/item.rb +94 -0
- data/lib/html2rss/json_feed_builder.rb +59 -0
- data/lib/html2rss/rendering/audio_renderer.rb +36 -0
- data/lib/html2rss/rendering/description_builder.rb +87 -0
- data/lib/html2rss/rendering/image_renderer.rb +41 -0
- data/lib/html2rss/rendering/media_renderer.rb +37 -0
- data/lib/html2rss/rendering/pdf_renderer.rb +34 -0
- data/lib/html2rss/rendering/video_renderer.rb +36 -0
- data/lib/html2rss/rendering.rb +23 -0
- data/lib/html2rss/request_controls.rb +123 -0
- data/lib/html2rss/request_service/botasaurus_contract.rb +161 -0
- data/lib/html2rss/request_service/botasaurus_strategy.rb +98 -0
- data/lib/html2rss/request_service/browserless_strategy.rb +103 -7
- data/lib/html2rss/request_service/budget.rb +39 -0
- data/lib/html2rss/request_service/context.rb +77 -21
- data/lib/html2rss/request_service/faraday_strategy.rb +137 -5
- data/lib/html2rss/request_service/policy.rb +252 -0
- data/lib/html2rss/request_service/puppet_commander.rb +212 -13
- data/lib/html2rss/request_service/response.rb +51 -3
- data/lib/html2rss/request_service/response_guard.rb +62 -0
- data/lib/html2rss/request_service.rb +50 -15
- data/lib/html2rss/request_session/rel_next_pager.rb +70 -0
- data/lib/html2rss/request_session/runtime_input.rb +71 -0
- data/lib/html2rss/request_session/runtime_policy.rb +83 -0
- data/lib/html2rss/request_session.rb +122 -0
- data/lib/html2rss/rss_builder/article.rb +187 -0
- data/lib/html2rss/rss_builder/channel.rb +105 -11
- data/lib/html2rss/rss_builder/enclosure.rb +62 -0
- data/lib/html2rss/rss_builder/stylesheet.rb +8 -4
- data/lib/html2rss/rss_builder.rb +76 -71
- data/lib/html2rss/selectors/config.rb +123 -0
- data/lib/html2rss/selectors/extractors/attribute.rb +52 -0
- data/lib/html2rss/selectors/extractors/href.rb +55 -0
- data/lib/html2rss/selectors/extractors/html.rb +49 -0
- data/lib/html2rss/selectors/extractors/static.rb +42 -0
- data/lib/html2rss/selectors/extractors/text.rb +47 -0
- data/lib/html2rss/selectors/extractors.rb +53 -0
- data/lib/html2rss/selectors/object_to_xml_converter.rb +62 -0
- data/lib/html2rss/selectors/post_processors/base.rb +80 -0
- data/lib/html2rss/selectors/post_processors/gsub.rb +88 -0
- data/lib/html2rss/selectors/post_processors/html_to_markdown.rb +48 -0
- data/lib/html2rss/selectors/post_processors/html_transformers/transform_urls_to_absolute_ones.rb +44 -0
- data/lib/html2rss/selectors/post_processors/html_transformers/wrap_img_in_a.rb +53 -0
- data/lib/html2rss/selectors/post_processors/markdown_to_html.rb +55 -0
- data/lib/html2rss/selectors/post_processors/parse_time.rb +78 -0
- data/lib/html2rss/selectors/post_processors/parse_uri.rb +43 -0
- data/lib/html2rss/selectors/post_processors/sanitize_html.rb +154 -0
- data/lib/html2rss/selectors/post_processors/substring.rb +77 -0
- data/lib/html2rss/selectors/post_processors/template.rb +76 -0
- data/lib/html2rss/selectors/post_processors.rb +48 -0
- data/lib/html2rss/selectors.rb +301 -0
- data/lib/html2rss/url.rb +266 -0
- data/lib/html2rss/version.rb +2 -1
- data/lib/html2rss.rb +67 -71
- data/lib/tasks/config_schema.rake +17 -0
- data/schema/html2rss-config.schema.json +551 -0
- metadata +120 -38
- data/lib/html2rss/attribute_post_processors/base.rb +0 -74
- data/lib/html2rss/attribute_post_processors/gsub.rb +0 -64
- data/lib/html2rss/attribute_post_processors/html_to_markdown.rb +0 -43
- data/lib/html2rss/attribute_post_processors/html_transformers/transform_urls_to_absolute_ones.rb +0 -27
- data/lib/html2rss/attribute_post_processors/html_transformers/wrap_img_in_a.rb +0 -41
- data/lib/html2rss/attribute_post_processors/markdown_to_html.rb +0 -50
- data/lib/html2rss/attribute_post_processors/parse_time.rb +0 -46
- data/lib/html2rss/attribute_post_processors/parse_uri.rb +0 -46
- data/lib/html2rss/attribute_post_processors/sanitize_html.rb +0 -115
- data/lib/html2rss/attribute_post_processors/substring.rb +0 -72
- data/lib/html2rss/attribute_post_processors/template.rb +0 -101
- data/lib/html2rss/attribute_post_processors.rb +0 -44
- data/lib/html2rss/auto_source/article.rb +0 -127
- data/lib/html2rss/auto_source/channel.rb +0 -78
- data/lib/html2rss/auto_source/reducer.rb +0 -48
- data/lib/html2rss/auto_source/rss_builder.rb +0 -70
- data/lib/html2rss/auto_source/scraper/semantic_html/extractor.rb +0 -136
- data/lib/html2rss/auto_source/scraper/semantic_html/image.rb +0 -54
- data/lib/html2rss/config/channel.rb +0 -125
- data/lib/html2rss/config/selectors.rb +0 -103
- data/lib/html2rss/item.rb +0 -186
- data/lib/html2rss/item_extractors/attribute.rb +0 -50
- data/lib/html2rss/item_extractors/href.rb +0 -52
- data/lib/html2rss/item_extractors/html.rb +0 -46
- data/lib/html2rss/item_extractors/static.rb +0 -39
- data/lib/html2rss/item_extractors/text.rb +0 -44
- data/lib/html2rss/item_extractors.rb +0 -88
- data/lib/html2rss/object_to_xml_converter.rb +0 -56
- data/lib/html2rss/rss_builder/item.rb +0 -83
- data/lib/html2rss/utils.rb +0 -113
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Html2rss
  ##
  # Stateless helpers for walking upwards through a parsed HTML tree.
  class HtmlNavigator
    ##
    # Walks from +node+ towards the root and returns the first node (the
    # starting node included) for which +condition+ returns a truthy value.
    # The walk stops, without a match, as soon as it reaches a missing node,
    # a document node, or a node named 'html'.
    #
    # @param node [Nokogiri::XML::Node, nil] node the walk starts from
    # @param condition [#call] predicate invoked with each visited node
    # @return [Nokogiri::XML::Node, nil] first node satisfying the condition
    def self.parent_until_condition(node, condition)
      candidate = node

      until !candidate || candidate.document? || candidate.name == 'html'
        return candidate if condition.call(candidate)

        candidate = candidate.parent
      end

      nil
    end

    ##
    # An "upwards" variant of +at_css+: runs +at_css(selector)+ on the given
    # node and then on each successive parent, returning the first hit.
    #
    # @param current_tag [Nokogiri::XML::Node, nil] node the search starts from
    # @param selector [String] CSS selector handed to +at_css+
    # @return [Nokogiri::XML::Node, nil] first match found while moving up
    def self.find_closest_selector_upwards(current_tag, selector)
      cursor = current_tag

      while cursor
        if (match = cursor.at_css(selector))
          return match
        end

        # Nodes that expose no #parent mark the top of the tree.
        break unless cursor.respond_to?(:parent)

        cursor = cursor.parent
      end

      nil
    end

    ##
    # Returns +current_tag+ itself when its name equals +tag_name+, otherwise
    # the first entry of +current_tag.ancestors(tag_name)+.
    #
    # @param current_tag [Nokogiri::XML::Node] node the lookup starts from
    # @param tag_name [String] tag name to look for
    # @return [Nokogiri::XML::Node, nil] the node itself or a matching ancestor
    def self.find_tag_in_ancestors(current_tag, tag_name)
      current_tag.name == tag_name ? current_tag : current_tag.ancestors(tag_name).first
    end
  end
end
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Html2rss
  class JsonFeedBuilder
    ##
    # Converts a single article into a JSONFeed 1.1 item hash.
    class Item
      ##
      # @param article [Html2rss::RssBuilder::Article] article to convert
      def initialize(article)
        @article = article
      end

      ##
      # Builds the item hash. Keys whose value is nil are omitted. When the
      # article offers neither a description nor a title there is no content
      # field to emit, and nil is returned instead of a hash.
      #
      # @return [Hash, nil] the JSONFeed item hash, or nil without content
      def to_h
        body = content_fields
        return nil if body.empty?

        { **item_payload, **body }.compact
      end

      private

      attr_reader :article

      ##
      # Metadata fields of the item; values may be nil until compacted.
      #
      # @return [Hash]
      def item_payload
        {
          id: article.guid,
          url: article.url&.to_s,
          title: article.title,
          image: article.image&.to_s,
          date_published: article.published_at&.iso8601,
          authors: author_array,
          tags: tags,
          attachments: attachments
        }
      end

      ##
      # @return [Array<Hash>, nil] single-entry author list, nil without author
      def author_array
        author_name = article.author
        [{ name: author_name }] if author_name
      end

      ##
      # JSON Feed items must carry content_html or content_text: the
      # description is used as HTML content, the title as plain-text fallback.
      #
      # @return [Hash] one content key, or an empty hash when neither exists
      def content_fields
        if (html = article.description)
          { content_html: html }
        elsif (text = article.title)
          { content_text: text }
        else
          {}
        end
      end

      ##
      # @return [Array<String>, nil] the article categories, nil when empty
      def tags
        labels = article.categories
        labels unless labels.empty?
      end

      ##
      # Converts each enclosure into a JSONFeed attachment object.
      #
      # @return [Array<Hash>, nil] attachments, nil when there are no enclosures
      def attachments
        media = article.enclosures
        return nil if media.empty?

        media.map { attachment_hash(_1) }
      end

      ##
      # @param enclosure [#url, #type, #bits_length] enclosure to convert
      # @return [Hash] attachment hash without nil values
      def attachment_hash(enclosure)
        length = enclosure.bits_length

        attachment = { url: enclosure.url.to_s, mime_type: enclosure.type }
        # Only a positive length is reported as the attachment size.
        attachment[:size_in_bytes] = length if length&.positive?
        attachment.compact
      end
    end
  end
end
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Html2rss
  ##
  # Assembles a JSONFeed 1.1 document (as a Ruby hash) from channel metadata
  # and a list of articles.
  #
  # @see https://www.jsonfeed.org/version/1.1/
  class JsonFeedBuilder
    # Value emitted under the feed's +version+ key.
    VERSION_URL = 'https://jsonfeed.org/version/1.1'

    ##
    # @param channel [Html2rss::RssBuilder::Channel] source of the feed-level fields
    # @param articles [Array<Html2rss::RssBuilder::Article>] articles to convert to items
    def initialize(channel:, articles:)
      @channel = channel
      @articles = articles
    end

    ##
    # Builds the feed hash. Top-level keys whose value is nil are omitted.
    #
    # @return [Hash] the JSONFeed hash
    def call
      feed = base_payload
      feed[:authors] = author_array
      feed[:items] = item_hashes
      feed.compact
    end

    private

    attr_reader :channel, :articles

    ##
    # Feed-level fields read from the channel; values may be nil.
    #
    # @return [Hash]
    def base_payload
      {
        version: VERSION_URL,
        title: channel.title,
        home_page_url: channel.url.to_s,
        description: channel.description,
        language: channel.language,
        icon: channel.image&.to_s
      }
    end

    ##
    # Converts every article through {Item}; articles for which {Item#to_h}
    # returns nil are left out.
    #
    # @return [Array<Hash>]
    def item_hashes
      articles.filter_map { |entry| Item.new(entry).to_h }
    end

    ##
    # @return [Array<Hash>, nil] single-entry author list, nil without channel author
    def author_array
      author_name = channel.author
      [{ name: author_name }] if author_name
    end
  end
end
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'cgi'
|
|
4
|
+
|
|
5
|
+
module Html2rss
  module Rendering
    # Produces an HTML <audio> element wrapping a single <source> child.
    class AudioRenderer
      # @param url [String, Html2rss::Url] URL written to the source's src attribute
      # @param type [String] MIME type written to the source's type attribute
      def initialize(url:, type:)
        @url = url
        @type = type
      end

      # @return [String] the <audio> markup; URL and type are HTML-escaped
      def to_html
        opening_tag = '<audio controls preload="none" referrerpolicy="no-referrer" crossorigin="anonymous">'

        "#{opening_tag}#{source_tag}</audio>"
      end

      private

      # @return [String] the <source> element carrying the escaped URL and type
      def source_tag
        %(<source src="#{escape(@url)}" type="#{escape(@type)}">)
      end

      # @param value [#to_s] raw attribute value
      # @return [String] value stringified and HTML-escaped
      def escape(value)
        CGI.escapeHTML(value.to_s)
      end
    end
  end
end
|
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Html2rss
  module Rendering
    # Builds a sanitized article description from the base text, title, and optional media.
    #
    # Rendered media (enclosures, or a fallback image when no enclosure renders)
    # is placed in front of the sanitized base text; the fragments are joined
    # with blank lines.
    #
    # @example Basic usage
    #   builder = DescriptionBuilder.new(
    #     base: "Article content",
    #     title: "Article Title",
    #     url: "https://example.com",
    #     enclosures: [enclosure_object],
    #     image: "https://example.com/image.jpg"
    #   )
    #   description = builder.call
    class DescriptionBuilder
      # Removes the first occurrence of +pattern+ from +text+, provided that
      # occurrence starts before +end_of_range+. Later occurrences are kept.
      #
      # Non-String arguments (including nil) make the method return +text+
      # untouched instead of raising.
      #
      # @param text [String]
      # @param pattern [String]
      # @param end_of_range [Integer, nil] exclusive upper bound for the start
      #   index of the occurrence; defaults to half the text length
      # @return [String] +text+ itself when nothing was removed, otherwise a new string
      def self.remove_pattern_from_start(text, pattern, end_of_range: nil)
        return text unless text.is_a?(String) && pattern.is_a?(String)

        # Resolved after the type guard so a nil text never reaches #size.
        end_of_range ||= text.size / 2

        index = text.index(pattern)
        return text if index.nil? || index >= end_of_range

        # Cut out exactly the occurrence that was just located. Plain slicing
        # avoids building a regex whose repeat count grows with the text size
        # and avoids per-line (^) matching on multi-line input.
        text[0...index] + text[(index + pattern.size)..]
      end

      # @param base [String] The base text content for the description
      # @param title [String] The article title (used for alt text and title removal)
      # @param url [String, Html2rss::Url] The article URL (used for sanitization)
      # @param enclosures [Array<Html2rss::RssBuilder::Enclosure>, nil] Media enclosures
      # @param image [String, Html2rss::Url, nil] Fallback image URL
      def initialize(base:, title:, url:, enclosures:, image:)
        @base = base.to_s
        @title = title
        @url = url
        @enclosures = Array(enclosures)
        @image = image
      end

      # Generates the complete description with media and sanitized text.
      #
      # @return [String, nil] The complete description or nil if empty
      def call
        fragments = []
        fragments.concat(Array(rendered_media))
        fragments << processed_base_description

        result = fragments.compact.join("\n\n").strip
        result.empty? ? nil : result
      end

      private

      # Rendered enclosures win; the fallback image is only used when no
      # enclosure produced any markup.
      #
      # @return [Array<String, nil>] HTML fragments (nil entries are compacted by #call)
      def rendered_media
        rendered = render_enclosures
        return rendered if rendered.any?
        return render_fallback_image if @image

        []
      end

      # @return [Array<String>] markup for every enclosure that has a renderer
      def render_enclosures
        @enclosures.filter_map do |enclosure|
          MediaRenderer.for(enclosure:, image: @image, title: @title)&.to_html
        end
      end

      # @return [Array<String, nil>] single-element list with the fallback image markup
      def render_fallback_image
        [MediaRenderer.for(enclosure: nil, image: @image, title: @title)&.to_html]
      end

      # Strips a leading title from the base text, then sanitizes the result.
      #
      # @return [Object] whatever SanitizeHtml.get returns for the text
      def processed_base_description
        text = self.class.remove_pattern_from_start(@base, @title)
        Html2rss::Selectors::PostProcessors::SanitizeHtml.get(text, @url)
      end
    end
  end
end
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'cgi'
|
|
4
|
+
|
|
5
|
+
module Html2rss
  module Rendering
    # Produces an HTML <img> element for a URL, using the title as both the
    # alt and the title attribute.
    class ImageRenderer
      # @param url [String, Html2rss::Url] URL written to the src attribute
      # @param title [String, nil] text written to the alt and title attributes
      def initialize(url:, title:)
        @url = url
        @title = title
      end

      # @return [String] the <img> markup; URL and title are HTML-escaped
      def to_html
        src = escape(@url)
        label = escape(@title)

        %(<img src="#{src}" alt="#{label}" title="#{label}" ) +
          'loading="lazy" referrerpolicy="no-referrer" decoding="async" crossorigin="anonymous">'
      end

      private

      # @param value [#to_s] raw attribute value
      # @return [String] value stringified and HTML-escaped
      def escape(value)
        CGI.escapeHTML(value.to_s)
      end
    end
  end
end
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Html2rss
  module Rendering
    # Factory: picks the appropriate renderer for a given enclosure or fallback image.
    class MediaRenderer
      # Chooses a renderer. Without an enclosure the fallback image (if any)
      # is rendered as an image; with an enclosure the choice is made from the
      # enclosure's MIME type.
      #
      # @param enclosure [Html2rss::RssBuilder::Enclosure, nil]
      # @param image [String, Html2rss::Url, nil] Fallback image URL
      # @param title [String]
      # @return [ImageRenderer, VideoRenderer, AudioRenderer, PdfRenderer, nil]
      #   nil when there is nothing to render or the type is unsupported
      def self.for(enclosure:, image:, title:)
        return ImageRenderer.new(url: image, title:) if enclosure.nil? && image
        return nil unless enclosure

        create_renderer_for_type(enclosure.type, url: enclosure.url, title:)
      end

      # Maps a MIME type to a renderer instance. The type prefixes are
      # anchored to the very start of the string (\A) so that a later line of
      # a multi-line value can never select a renderer.
      #
      # @private
      # @param type [String, nil] enclosure MIME type
      # @param url [String, Html2rss::Url] enclosure URL
      # @param title [String, nil] title used by image renderer
      # @return [ImageRenderer, VideoRenderer, AudioRenderer, PdfRenderer, nil]
      #   nil for nil or unsupported types
      def self.create_renderer_for_type(type, url:, title:)
        case type
        when %r{\Aimage/}
          ImageRenderer.new(url:, title:)
        when %r{\Avideo/}
          VideoRenderer.new(url:, type:)
        when %r{\Aaudio/}
          AudioRenderer.new(url:, type:)
        when 'application/pdf'
          PdfRenderer.new(url:)
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'cgi'
|
|
4
|
+
|
|
5
|
+
module Html2rss
  module Rendering
    # Produces an HTML <iframe> element pointing at a PDF document.
    class PdfRenderer
      # @param url [String, Html2rss::Url] PDF URL written to the iframe's src attribute
      def initialize(url:)
        @url = url
      end

      # NOTE(review): height="75vh" uses a CSS unit inside the HTML height
      # attribute; browsers may not interpret it as a viewport-relative
      # height — confirm this is the intended markup.
      #
      # @return [String] the <iframe> markup; the URL is HTML-escaped
      def to_html
        src = CGI.escapeHTML(@url.to_s)

        %(<iframe src="#{src}" width="100%" height="75vh" sandbox="" ) +
          'referrerpolicy="no-referrer" loading="lazy"></iframe>'
      end
    end
  end
end
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'cgi'
|
|
4
|
+
|
|
5
|
+
module Html2rss
  module Rendering
    # Produces an HTML <video> element wrapping a single <source> child.
    class VideoRenderer
      # @param url [String, Html2rss::Url] URL written to the source's src attribute
      # @param type [String] MIME type written to the source's type attribute
      def initialize(url:, type:)
        @url = url
        @type = type
      end

      # @return [String] the <video> markup; URL and type are HTML-escaped
      def to_html
        opening_tag =
          '<video controls preload="none" referrerpolicy="no-referrer" crossorigin="anonymous" playsinline>'

        "#{opening_tag}#{source_tag}</video>"
      end

      private

      # @return [String] the <source> element carrying the escaped URL and type
      def source_tag
        %(<source src="#{escape(@url)}" type="#{escape(@type)}">)
      end

      # @param value [#to_s] raw attribute value
      # @return [String] value stringified and HTML-escaped
      def escape(value)
        CGI.escapeHTML(value.to_s)
      end
    end
  end
end
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Html2rss
  # Namespace for the classes that turn media references (images, audio,
  # video, PDF documents) into HTML snippets for feed descriptions.
  #
  # @example Rendering an image directly
  #   Html2rss::Rendering::ImageRenderer.new(
  #     url: "https://example.com/image.jpg",
  #     title: "Example"
  #   ).to_html
  #
  # @example Letting the factory pick a renderer
  #   Html2rss::Rendering::MediaRenderer.for(
  #     enclosure: nil,
  #     image: "https://example.com/image.jpg",
  #     title: "Example"
  #   )
  #
  # @see Html2rss::Rendering::DescriptionBuilder
  module Rendering
  end
end
|
|
@@ -0,0 +1,123 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Html2rss
  ##
  # Immutable value object holding the request controls (strategy, redirect
  # limit, request budget) and remembering which of them the caller supplied
  # explicitly.
  class RequestControls
    # Request-control keys accepted at the top level of feed config.
    TOP_LEVEL_KEYS = %i[strategy].freeze
    # Request-control keys accepted under the nested `request` config.
    REQUEST_KEYS = %i[max_redirects max_requests].freeze

    ##
    # Extracts the controls from a raw config hash. A control counts as
    # explicit when its key is present, regardless of the value. A `request`
    # entry that is not a Hash is ignored.
    #
    # @param config [Hash{Symbol => Object}] raw config input
    # @return [RequestControls] request controls extracted from the config hash
    def self.from_config(config)
      HashUtil.assert_symbol_keys!(config, context: 'config', deep: false)
      HashUtil.assert_symbol_keys!(config[:request], context: 'config[:request]') if config[:request].is_a?(Hash)

      section = request_section(config)
      present_keys = TOP_LEVEL_KEYS.select { |key| config.key?(key) } +
                     REQUEST_KEYS.select { |key| section.key?(key) }

      new(
        strategy: config[:strategy],
        max_redirects: section[:max_redirects],
        max_requests: section[:max_requests],
        explicit_keys: present_keys
      )
    end

    ##
    # @param config [Hash{Symbol => Object}] raw config input
    # @return [Hash] the nested request config, or an empty hash when it is not a Hash
    def self.request_section(config)
      candidate = config[:request]
      candidate.is_a?(Hash) ? candidate : {}
    end

    private_class_method :request_section

    ##
    # The instance is frozen once all values are assigned.
    #
    # @param strategy [Symbol, nil] effective request strategy
    # @param max_redirects [Integer, nil] effective redirect limit
    # @param max_requests [Integer, nil] effective request budget
    # @param explicit_keys [Array<Symbol>] controls explicitly supplied by the caller
    def initialize(strategy: nil, max_redirects: nil, max_requests: nil, explicit_keys: [])
      @strategy = strategy
      @max_redirects = max_redirects
      @max_requests = max_requests
      @explicit_keys = explicit_keys.map(&:to_sym).uniq.freeze
      freeze
    end

    ##
    # @return [Symbol, nil] effective request strategy
    attr_reader :strategy

    ##
    # @return [Integer, nil] effective redirect limit
    attr_reader :max_redirects

    ##
    # @return [Integer, nil] effective request budget
    attr_reader :max_requests

    ##
    # @param name [Symbol, String] request control name
    # @return [Boolean] whether the control was explicitly supplied
    def explicit?(name)
      explicit_keys.include?(name.to_sym)
    end

    ##
    # Returns a new instance carrying the given values while keeping this
    # instance's set of explicit keys.
    #
    # @param strategy [Symbol, nil] validated request strategy
    # @param max_redirects [Integer, nil] validated redirect limit
    # @param max_requests [Integer, nil] validated request budget
    # @return [RequestControls] controls updated with validated effective values
    def with_effective_values(strategy:, max_redirects:, max_requests:)
      self.class.new(
        strategy: strategy,
        max_redirects: max_redirects,
        max_requests: max_requests,
        explicit_keys: explicit_keys
      )
    end

    ##
    # Writes the explicitly supplied controls into +config+ (mutating it);
    # controls that were not explicit leave the hash untouched.
    #
    # @param config [Hash{Symbol => Object}] mutable config hash
    # @return [Hash{Symbol => Object}] the same hash with explicit controls written
    # @raise [ArgumentError] when a request control must be written but
    #   +config[:request]+ is neither nil nor a Hash
    def apply_to(config)
      config[:strategy] = strategy if explicit?(:strategy)

      { max_redirects: max_redirects, max_requests: max_requests }.each do |key, value|
        next unless explicit?(key)

        ensure_request_config!(config)
        config[:request][key] = value
      end

      config
    end

    private

    attr_reader :explicit_keys

    ##
    # Makes sure +config[:request]+ is a Hash, creating an empty one when the
    # entry is nil.
    #
    # @param config [Hash{Symbol => Object}] mutable config hash
    # @raise [ArgumentError] when the existing entry is not a Hash
    def ensure_request_config!(config)
      case config[:request]
      when nil then config[:request] = {}
      when Hash then nil
      else raise ArgumentError, 'request config must be a hash'
      end
    end
  end
end
|