html2rss 0.17.0 → 0.19.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (127)
  1. checksums.yaml +4 -4
  2. data/README.md +90 -639
  3. data/exe/html2rss +1 -1
  4. data/html2rss.gemspec +5 -2
  5. data/lib/html2rss/articles/deduplicator.rb +50 -0
  6. data/lib/html2rss/auto_source/cleanup.rb +44 -5
  7. data/lib/html2rss/auto_source/scraper/html.rb +123 -43
  8. data/lib/html2rss/auto_source/scraper/json_state.rb +457 -0
  9. data/lib/html2rss/auto_source/scraper/microdata.rb +505 -0
  10. data/lib/html2rss/auto_source/scraper/schema/category_extractor.rb +102 -0
  11. data/lib/html2rss/auto_source/scraper/schema/item_list.rb +3 -2
  12. data/lib/html2rss/auto_source/scraper/schema/list_item.rb +6 -4
  13. data/lib/html2rss/auto_source/scraper/schema/thing.rb +69 -8
  14. data/lib/html2rss/auto_source/scraper/schema.rb +27 -12
  15. data/lib/html2rss/auto_source/scraper/semantic_html/anchor_selector.rb +204 -0
  16. data/lib/html2rss/auto_source/scraper/semantic_html.rb +88 -79
  17. data/lib/html2rss/auto_source/scraper/wordpress_api/page_scope.rb +311 -0
  18. data/lib/html2rss/auto_source/scraper/wordpress_api/posts_endpoint.rb +135 -0
  19. data/lib/html2rss/auto_source/scraper/wordpress_api.rb +186 -0
  20. data/lib/html2rss/auto_source/scraper.rb +160 -8
  21. data/lib/html2rss/auto_source.rb +123 -47
  22. data/lib/html2rss/blocked_surface.rb +65 -0
  23. data/lib/html2rss/category_extractor.rb +82 -0
  24. data/lib/html2rss/cli.rb +194 -23
  25. data/lib/html2rss/config/class_methods.rb +178 -0
  26. data/lib/html2rss/config/dynamic_params.rb +70 -0
  27. data/lib/html2rss/config/multiple_feeds_config.rb +51 -0
  28. data/lib/html2rss/config/request_headers.rb +136 -0
  29. data/lib/html2rss/config/schema.rb +240 -0
  30. data/lib/html2rss/config/validator.rb +146 -0
  31. data/lib/html2rss/config.rb +118 -61
  32. data/lib/html2rss/error.rb +31 -0
  33. data/lib/html2rss/feed_pipeline/auto_fallback.rb +127 -0
  34. data/lib/html2rss/feed_pipeline.rb +127 -0
  35. data/lib/html2rss/hash_util.rb +101 -0
  36. data/lib/html2rss/html_extractor/date_extractor.rb +20 -0
  37. data/lib/html2rss/html_extractor/enclosure_extractor.rb +120 -0
  38. data/lib/html2rss/html_extractor/image_extractor.rb +58 -0
  39. data/lib/html2rss/html_extractor.rb +141 -0
  40. data/lib/html2rss/html_navigator.rb +54 -0
  41. data/lib/html2rss/json_feed_builder/item.rb +94 -0
  42. data/lib/html2rss/json_feed_builder.rb +59 -0
  43. data/lib/html2rss/rendering/audio_renderer.rb +36 -0
  44. data/lib/html2rss/rendering/description_builder.rb +87 -0
  45. data/lib/html2rss/rendering/image_renderer.rb +41 -0
  46. data/lib/html2rss/rendering/media_renderer.rb +37 -0
  47. data/lib/html2rss/rendering/pdf_renderer.rb +34 -0
  48. data/lib/html2rss/rendering/video_renderer.rb +36 -0
  49. data/lib/html2rss/rendering.rb +23 -0
  50. data/lib/html2rss/request_controls.rb +123 -0
  51. data/lib/html2rss/request_service/botasaurus_contract.rb +161 -0
  52. data/lib/html2rss/request_service/botasaurus_strategy.rb +98 -0
  53. data/lib/html2rss/request_service/browserless_strategy.rb +103 -7
  54. data/lib/html2rss/request_service/budget.rb +39 -0
  55. data/lib/html2rss/request_service/context.rb +77 -21
  56. data/lib/html2rss/request_service/faraday_strategy.rb +137 -5
  57. data/lib/html2rss/request_service/policy.rb +252 -0
  58. data/lib/html2rss/request_service/puppet_commander.rb +212 -13
  59. data/lib/html2rss/request_service/response.rb +51 -3
  60. data/lib/html2rss/request_service/response_guard.rb +62 -0
  61. data/lib/html2rss/request_service.rb +50 -15
  62. data/lib/html2rss/request_session/rel_next_pager.rb +70 -0
  63. data/lib/html2rss/request_session/runtime_input.rb +71 -0
  64. data/lib/html2rss/request_session/runtime_policy.rb +83 -0
  65. data/lib/html2rss/request_session.rb +122 -0
  66. data/lib/html2rss/rss_builder/article.rb +187 -0
  67. data/lib/html2rss/rss_builder/channel.rb +105 -11
  68. data/lib/html2rss/rss_builder/enclosure.rb +62 -0
  69. data/lib/html2rss/rss_builder/stylesheet.rb +8 -4
  70. data/lib/html2rss/rss_builder.rb +76 -71
  71. data/lib/html2rss/selectors/config.rb +123 -0
  72. data/lib/html2rss/selectors/extractors/attribute.rb +52 -0
  73. data/lib/html2rss/selectors/extractors/href.rb +55 -0
  74. data/lib/html2rss/selectors/extractors/html.rb +49 -0
  75. data/lib/html2rss/selectors/extractors/static.rb +42 -0
  76. data/lib/html2rss/selectors/extractors/text.rb +47 -0
  77. data/lib/html2rss/selectors/extractors.rb +53 -0
  78. data/lib/html2rss/selectors/object_to_xml_converter.rb +62 -0
  79. data/lib/html2rss/selectors/post_processors/base.rb +80 -0
  80. data/lib/html2rss/selectors/post_processors/gsub.rb +88 -0
  81. data/lib/html2rss/selectors/post_processors/html_to_markdown.rb +48 -0
  82. data/lib/html2rss/selectors/post_processors/html_transformers/transform_urls_to_absolute_ones.rb +44 -0
  83. data/lib/html2rss/selectors/post_processors/html_transformers/wrap_img_in_a.rb +53 -0
  84. data/lib/html2rss/selectors/post_processors/markdown_to_html.rb +55 -0
  85. data/lib/html2rss/selectors/post_processors/parse_time.rb +78 -0
  86. data/lib/html2rss/selectors/post_processors/parse_uri.rb +43 -0
  87. data/lib/html2rss/selectors/post_processors/sanitize_html.rb +154 -0
  88. data/lib/html2rss/selectors/post_processors/substring.rb +77 -0
  89. data/lib/html2rss/selectors/post_processors/template.rb +76 -0
  90. data/lib/html2rss/selectors/post_processors.rb +48 -0
  91. data/lib/html2rss/selectors.rb +301 -0
  92. data/lib/html2rss/url.rb +266 -0
  93. data/lib/html2rss/version.rb +2 -1
  94. data/lib/html2rss.rb +67 -71
  95. data/lib/tasks/config_schema.rake +17 -0
  96. data/schema/html2rss-config.schema.json +551 -0
  97. metadata +120 -38
  98. data/lib/html2rss/attribute_post_processors/base.rb +0 -74
  99. data/lib/html2rss/attribute_post_processors/gsub.rb +0 -64
  100. data/lib/html2rss/attribute_post_processors/html_to_markdown.rb +0 -43
  101. data/lib/html2rss/attribute_post_processors/html_transformers/transform_urls_to_absolute_ones.rb +0 -27
  102. data/lib/html2rss/attribute_post_processors/html_transformers/wrap_img_in_a.rb +0 -41
  103. data/lib/html2rss/attribute_post_processors/markdown_to_html.rb +0 -50
  104. data/lib/html2rss/attribute_post_processors/parse_time.rb +0 -46
  105. data/lib/html2rss/attribute_post_processors/parse_uri.rb +0 -46
  106. data/lib/html2rss/attribute_post_processors/sanitize_html.rb +0 -115
  107. data/lib/html2rss/attribute_post_processors/substring.rb +0 -72
  108. data/lib/html2rss/attribute_post_processors/template.rb +0 -101
  109. data/lib/html2rss/attribute_post_processors.rb +0 -44
  110. data/lib/html2rss/auto_source/article.rb +0 -127
  111. data/lib/html2rss/auto_source/channel.rb +0 -78
  112. data/lib/html2rss/auto_source/reducer.rb +0 -48
  113. data/lib/html2rss/auto_source/rss_builder.rb +0 -70
  114. data/lib/html2rss/auto_source/scraper/semantic_html/extractor.rb +0 -136
  115. data/lib/html2rss/auto_source/scraper/semantic_html/image.rb +0 -54
  116. data/lib/html2rss/config/channel.rb +0 -125
  117. data/lib/html2rss/config/selectors.rb +0 -103
  118. data/lib/html2rss/item.rb +0 -186
  119. data/lib/html2rss/item_extractors/attribute.rb +0 -50
  120. data/lib/html2rss/item_extractors/href.rb +0 -52
  121. data/lib/html2rss/item_extractors/html.rb +0 -46
  122. data/lib/html2rss/item_extractors/static.rb +0 -39
  123. data/lib/html2rss/item_extractors/text.rb +0 -44
  124. data/lib/html2rss/item_extractors.rb +0 -88
  125. data/lib/html2rss/object_to_xml_converter.rb +0 -56
  126. data/lib/html2rss/rss_builder/item.rb +0 -83
  127. data/lib/html2rss/utils.rb +0 -113
@@ -0,0 +1,54 @@
1
+ # frozen_string_literal: true
2
+
3
module Html2rss
  ##
  # Stateless helpers for walking upwards through a tree of HTML nodes.
  class HtmlNavigator
    ##
    # Walks from +node+ towards the root and returns the first node for which
    # +condition+ is truthy. The starting node itself is tested first.
    # The walk stops (yielding nil) at a document node or at the `html` element;
    # neither of those is ever passed to +condition+.
    #
    # @param node [Nokogiri::XML::Node, nil] node the walk starts at
    # @param condition [#call] callable receiving a node, returning truthy on a match
    # @return [Nokogiri::XML::Node, nil] the matching node, or nil when none matched
    def self.parent_until_condition(node, condition)
      candidate = node

      until !candidate || candidate.document? || candidate.name == 'html'
        return candidate if condition.call(candidate)

        candidate = candidate.parent
      end

      nil
    end

    ##
    # A kind of "css upwards": asks +current_tag+, then each successive parent,
    # for `at_css(selector)` and returns the first non-nil answer.
    #
    # @param current_tag [Nokogiri::XML::Node, nil] node the walk starts at
    # @param selector [String] CSS selector handed to `at_css`
    # @return [Nokogiri::XML::Node, nil] first hit found while walking up, or nil
    def self.find_closest_selector_upwards(current_tag, selector)
      cursor = current_tag

      while cursor
        match = cursor.at_css(selector)
        return match if match

        # Stop as soon as the current object cannot hand out a parent.
        break unless cursor.respond_to?(:parent)

        cursor = cursor.parent
      end

      nil
    end

    ##
    # Returns +current_tag+ when it already carries the wanted tag name,
    # otherwise the first entry of `ancestors(tag_name)`.
    #
    # @param current_tag [Nokogiri::XML::Node] node to inspect
    # @param tag_name [String] tag name to look for
    # @return [Nokogiri::XML::Node, nil] the node itself, a matching ancestor, or nil
    def self.find_tag_in_ancestors(current_tag, tag_name)
      current_tag.name == tag_name ? current_tag : current_tag.ancestors(tag_name).first
    end
  end
end
@@ -0,0 +1,94 @@
1
+ # frozen_string_literal: true
2
+
3
module Html2rss
  class JsonFeedBuilder
    ##
    # Turns one article into a JSONFeed 1.1 item hash.
    class Item
      ##
      # @param article [Html2rss::RssBuilder::Article] article to convert
      def initialize(article)
        @article = article
      end

      ##
      # Builds the item hash. Keys whose value is nil are dropped.
      #
      # @return [Hash, nil] item hash, or nil when the article offers neither
      #   a description nor a title to use as content
      def to_h
        body = content_fields
        return nil if body.empty?

        item_payload.merge(body).compact
      end

      private

      attr_reader :article

      ##
      # Collects every non-content field; nil values are removed later by #to_h.
      #
      # @return [Hash]
      def item_payload
        {
          id: article.guid,
          url: article.url&.to_s,
          title: article.title,
          image: article.image&.to_s,
          date_published: article.published_at&.iso8601,
          authors: author_array,
          tags: tags,
          attachments: attachments
        }
      end

      ##
      # @return [Array<Hash>, nil] single-element author list, nil without an author
      def author_array
        author_name = article.author
        author_name ? [{ name: author_name }] : nil
      end

      ##
      # JSON Feed items must include content_html or content_text.
      # The description is preferred; the title is the plain-text fallback.
      #
      # @return [Hash] one-entry hash, or an empty hash when neither is available
      def content_fields
        if (html = article.description)
          { content_html: html }
        elsif (text = article.title)
          { content_text: text }
        else
          {}
        end
      end

      ##
      # @return [Array<String>, nil] the article categories, nil when there are none
      def tags
        labels = article.categories
        labels unless labels.empty?
      end

      ##
      # Maps the article's enclosures to JSONFeed attachment objects.
      #
      # @return [Array<Hash>, nil] attachments, nil when there are no enclosures
      def attachments
        files = article.enclosures
        files.map { |file| attachment_hash(file) } unless files.empty?
      end

      ##
      # @param enclosure [#url, #type, #bits_length] enclosure to describe
      # @return [Hash] attachment hash; the size is only included when positive
      def attachment_hash(enclosure)
        byte_count = enclosure.bits_length

        payload = { url: enclosure.url.to_s, mime_type: enclosure.type }
        payload[:size_in_bytes] = byte_count if byte_count&.positive?
        payload.compact
      end
    end
  end
end
@@ -0,0 +1,59 @@
1
+ # frozen_string_literal: true
2
+
3
module Html2rss
  ##
  # Assembles a JSONFeed 1.1 hash out of channel metadata and a list of articles.
  #
  # @see https://www.jsonfeed.org/version/1.1/
  class JsonFeedBuilder
    # Value written to the feed's `version` field (JSON Feed 1.1).
    VERSION_URL = 'https://jsonfeed.org/version/1.1'

    ##
    # @param channel [Html2rss::RssBuilder::Channel] source of the feed-level fields
    # @param articles [Array<Html2rss::RssBuilder::Article>] articles to turn into items
    def initialize(channel:, articles:)
      @channel = channel
      @articles = articles
    end

    ##
    # Builds the feed hash. Top-level keys whose value is nil are dropped.
    #
    # @return [Hash] the JSONFeed hash
    def call
      feed = base_payload
      feed[:authors] = author_array
      feed[:items] = item_hashes
      feed.compact
    end

    private

    attr_reader :channel, :articles

    ##
    # @return [Hash] feed-level fields read from the channel
    def base_payload
      {
        version: VERSION_URL,
        title: channel.title,
        home_page_url: channel.url.to_s,
        description: channel.description,
        language: channel.language,
        icon: channel.image&.to_s
      }
    end

    ##
    # Converts every article through Item and keeps only the truthy results.
    #
    # @return [Array<Hash>]
    def item_hashes
      articles.each_with_object([]) do |article, collected|
        item = Item.new(article).to_h
        collected << item if item
      end
    end

    ##
    # @return [Array<Hash>, nil] single-element author list, nil without a channel author
    def author_array
      author_name = channel.author
      author_name ? [{ name: author_name }] : nil
    end
  end
end
@@ -0,0 +1,36 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'cgi'
4
+
5
module Html2rss
  module Rendering
    # Produces an HTML <audio> element with a single <source> child.
    class AudioRenderer
      # @param url [String, Html2rss::Url] media URL used as the source's src
      # @param type [String] MIME type used as the source's type
      def initialize(url:, type:)
        @url = url
        @type = type
      end

      # Both dynamic values are HTML-escaped before being placed into attributes.
      #
      # @return [String] the <audio> snippet
      def to_html
        opening = '<audio controls preload="none" referrerpolicy="no-referrer" crossorigin="anonymous">'
        source = %(<source src="#{html_escape(@url)}" type="#{html_escape(@type)}">)

        "#{opening}#{source}</audio>"
      end

      private

      # @param value [#to_s] raw attribute value
      # @return [String] value converted to a string and HTML-escaped
      def html_escape(value)
        CGI.escapeHTML(value.to_s)
      end
    end
  end
end
@@ -0,0 +1,87 @@
1
+ # frozen_string_literal: true
2
+
3
module Html2rss
  module Rendering
    # Builds a sanitized article description from the base text, title, and optional media.
    #
    # Rendered media (enclosures, or a fallback image when no enclosure renders)
    # is placed in front of the sanitized base text; fragments are separated by
    # a blank line.
    #
    # @example Basic usage
    #   builder = DescriptionBuilder.new(
    #     base: "Article content",
    #     title: "Article Title",
    #     url: "https://example.com",
    #     enclosures: [enclosure_object],
    #     image: "https://example.com/image.jpg"
    #   )
    #   description = builder.call
    class DescriptionBuilder
      # Removes +pattern+ from +text+ when its first occurrence starts before
      # +end_of_range+.
      #
      # Non-String +text+ or +pattern+ is returned untouched. The default for
      # +end_of_range+ is computed only after that type check, so passing
      # +nil+ (or any other non-String) as +text+ no longer raises.
      #
      # NOTE: the removal regex is anchored with `^` and applied with `gsub`,
      # so every line of a multi-line text is considered on its own.
      #
      # @param text [String]
      # @param pattern [String]
      # @param end_of_range [Integer, nil] Optional, defaults to half the text length
      # @return [String] text without the pattern, or the unchanged input
      def self.remove_pattern_from_start(text, pattern, end_of_range: nil)
        return text unless text.is_a?(String) && pattern.is_a?(String)

        limit = end_of_range || (text.size * 0.5).to_i

        index = text.index(pattern)
        return text if index.nil? || index >= limit

        text.gsub(/^(.{0,#{limit}})#{Regexp.escape(pattern)}/, '\1')
      end

      # @param base [String] The base text content for the description
      # @param title [String] The article title (used for alt text and title removal)
      # @param url [String, Html2rss::Url] The article URL (passed to the sanitizer)
      # @param enclosures [Array<Html2rss::RssBuilder::Enclosure>, nil] Media enclosures
      # @param image [String, Html2rss::Url, nil] Fallback image URL
      def initialize(base:, title:, url:, enclosures:, image:)
        @base = base.to_s
        @title = title
        @url = url
        @enclosures = Array(enclosures)
        @image = image
      end

      # Generates the complete description with media and sanitized text.
      #
      # @return [String, nil] The complete description or nil if empty
      def call
        fragments = []
        fragments.concat(Array(rendered_media))
        fragments << processed_base_description

        result = fragments.compact.join("\n\n").strip
        result.empty? ? nil : result
      end

      private

      # Prefers rendered enclosures; falls back to the image only when no
      # enclosure produced any HTML.
      #
      # @return [Array<String, nil>]
      def rendered_media
        rendered = render_enclosures
        return rendered if rendered.any?
        return render_fallback_image if @image

        []
      end

      # @return [Array<String>] HTML for each enclosure that has a renderer
      def render_enclosures
        @enclosures.filter_map do |enclosure|
          MediaRenderer.for(enclosure:, image: @image, title: @title)&.to_html
        end
      end

      # @return [Array<String, nil>] one-element array; the nil case is compacted away in #call
      def render_fallback_image
        [MediaRenderer.for(enclosure: nil, image: @image, title: @title)&.to_html]
      end

      # Strips a leading title from the base text, then hands the result to
      # the SanitizeHtml post processor together with the article URL.
      def processed_base_description
        text = self.class.remove_pattern_from_start(@base, @title)
        Html2rss::Selectors::PostProcessors::SanitizeHtml.get(text, @url)
      end
    end
  end
end
@@ -0,0 +1,41 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'cgi'
4
+
5
module Html2rss
  module Rendering
    # Produces an HTML <img> element for a URL and a title.
    class ImageRenderer
      # @param url [String, Html2rss::Url] image URL for the src attribute
      # @param title [String, nil] text used for both the alt and title attributes
      def initialize(url:, title:)
        @url = url
        @title = title
      end

      # URL and title are HTML-escaped; the remaining attributes are fixed.
      #
      # @return [String] the <img> snippet
      def to_html
        src = CGI.escapeHTML(@url.to_s)
        label = CGI.escapeHTML(@title.to_s)

        dynamic = %(src="#{src}" alt="#{label}" title="#{label}")
        fixed = 'loading="lazy" referrerpolicy="no-referrer" decoding="async" crossorigin="anonymous"'

        "<img #{dynamic} #{fixed}>"
      end
    end
  end
end
@@ -0,0 +1,37 @@
1
+ # frozen_string_literal: true
2
+
3
module Html2rss
  module Rendering
    # Factory: picks the appropriate renderer for a given enclosure or fallback image.
    class MediaRenderer
      # Without an enclosure, the fallback image (if any) is rendered as an
      # image. With an enclosure, the renderer is chosen by its MIME type and
      # the fallback image is ignored.
      #
      # @param enclosure [Html2rss::RssBuilder::Enclosure, nil]
      # @param image [String, Html2rss::Url, nil] Fallback image URL
      # @param title [String]
      # @return [ImageRenderer, VideoRenderer, AudioRenderer, PdfRenderer, nil]
      #   nil when there is nothing to render or the type is not supported
      def self.for(enclosure:, image:, title:)
        return ImageRenderer.new(url: image, title:) if enclosure.nil? && image
        return nil unless enclosure

        create_renderer_for_type(enclosure.type, url: enclosure.url, title:)
      end

      # Maps a MIME type to a renderer instance.
      #
      # The prefix checks are anchored with `\A` (start of string) rather than
      # `^` (start of any line), so a value containing a newline followed by
      # e.g. `image/` is not mistaken for an image type.
      #
      # @private
      # @param type [String, nil] enclosure MIME type
      # @param url [String, Html2rss::Url] enclosure URL
      # @param title [String, nil] title used by image renderer
      # @return [ImageRenderer, VideoRenderer, AudioRenderer, PdfRenderer, nil]
      def self.create_renderer_for_type(type, url:, title:)
        case type
        when %r{\Aimage/}
          ImageRenderer.new(url:, title:)
        when %r{\Avideo/}
          VideoRenderer.new(url:, type:)
        when %r{\Aaudio/}
          AudioRenderer.new(url:, type:)
        when 'application/pdf'
          PdfRenderer.new(url:)
        end
      end
    end
  end
end
@@ -0,0 +1,34 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'cgi'
4
+
5
module Html2rss
  module Rendering
    # Produces an HTML <iframe> element pointing at a PDF document.
    class PdfRenderer
      # @param url [String, Html2rss::Url] PDF URL used as the iframe's src
      def initialize(url:)
        @url = url
      end

      # The URL is HTML-escaped; the remaining attributes are fixed.
      #
      # @return [String] the <iframe> snippet
      def to_html
        src = CGI.escapeHTML(@url.to_s)
        fixed = 'width="100%" height="75vh" sandbox="" referrerpolicy="no-referrer" loading="lazy"'

        %(<iframe src="#{src}" #{fixed}></iframe>)
      end
    end
  end
end
@@ -0,0 +1,36 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'cgi'
4
+
5
module Html2rss
  module Rendering
    # Produces an HTML <video> element with a single <source> child.
    class VideoRenderer
      # @param url [String, Html2rss::Url] media URL used as the source's src
      # @param type [String] MIME type used as the source's type
      def initialize(url:, type:)
        @url = url
        @type = type
      end

      # Both dynamic values are HTML-escaped before being placed into attributes.
      #
      # @return [String] the <video> snippet
      def to_html
        opening = '<video controls preload="none" referrerpolicy="no-referrer" crossorigin="anonymous" playsinline>'
        source = %(<source src="#{html_escape(@url)}" type="#{html_escape(@type)}">)

        "#{opening}#{source}</video>"
      end

      private

      # @param value [#to_s] raw attribute value
      # @return [String] value converted to a string and HTML-escaped
      def html_escape(value)
        CGI.escapeHTML(value.to_s)
      end
    end
  end
end
@@ -0,0 +1,23 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Html2rss
4
+ # Namespace for HTML rendering logic, used to generate rich content such as
5
+ # images, audio, video, or embedded documents for feed descriptions.
6
+ #
7
+ # @example Render an <img> tag for an image URL
8
+ # Html2rss::Rendering::ImageRenderer.new(
9
+ # url: "https://example.com/image.jpg",
10
+ # title: "Example"
11
+ # ).to_html
12
+ #
13
+ # @example Pick a renderer for a fallback image (no enclosure given)
14
+ # Html2rss::Rendering::MediaRenderer.for(
15
+ # enclosure: nil,
16
+ # image: "https://example.com/image.jpg",
17
+ # title: "Example"
18
+ # )
19
+ #
20
+ # @see Html2rss::Rendering::DescriptionBuilder
21
+ module Rendering
22
+ end
23
+ end
@@ -0,0 +1,123 @@
1
+ # frozen_string_literal: true
2
+
3
module Html2rss
  ##
  # Immutable holder for the request controls (strategy, redirect limit,
  # request budget) plus the set of controls the caller supplied explicitly.
  class RequestControls
    # Request-control keys accepted at the top level of feed config.
    TOP_LEVEL_KEYS = %i[strategy].freeze
    # Request-control keys accepted under the nested `request` config.
    REQUEST_KEYS = %i[max_redirects max_requests].freeze

    class << self
      ##
      # Reads the controls out of a config hash. A key counts as explicit when
      # it is present in the hash, whatever its value.
      #
      # @param config [Hash{Symbol => Object}] raw config input
      # @return [RequestControls] request controls extracted from the config hash
      def from_config(config)
        HashUtil.assert_symbol_keys!(config, context: 'config', deep: false)

        nested = config[:request]
        HashUtil.assert_symbol_keys!(nested, context: 'config[:request]') if nested.is_a?(Hash)

        new(
          strategy: config[:strategy],
          max_redirects: request_value_for(config, :max_redirects),
          max_requests: request_value_for(config, :max_requests),
          explicit_keys: explicit_keys_for(config)
        )
      end

      private

      # @return [Array<Symbol>] control keys present in the config, top-level ones first
      def explicit_keys_for(config)
        top_level = TOP_LEVEL_KEYS.select { |key| config.key?(key) }
        nested = REQUEST_KEYS.select { |key| request_key?(config, key) }

        top_level + nested
      end

      # @return [Object, nil] value stored under `config[:request][key]`, nil without a request hash
      def request_value_for(config, key)
        section = request_section(config)
        section && section[key]
      end

      # @return [Boolean] whether the nested request hash exists and contains the key
      def request_key?(config, key)
        section = request_section(config)
        !section.nil? && section.key?(key)
      end

      # @return [Hash, nil] the nested request config, nil when it is not a Hash
      def request_section(config)
        section = config[:request]
        section if section.is_a?(Hash)
      end
    end

    ##
    # The instance is frozen at the end of construction.
    #
    # @param strategy [Symbol, nil] effective request strategy
    # @param max_redirects [Integer, nil] effective redirect limit
    # @param max_requests [Integer, nil] effective request budget
    # @param explicit_keys [Array<Symbol>] controls explicitly supplied by the caller
    def initialize(strategy: nil, max_redirects: nil, max_requests: nil, explicit_keys: [])
      @strategy = strategy
      @max_redirects = max_redirects
      @max_requests = max_requests
      @explicit_keys = explicit_keys.map(&:to_sym).uniq.freeze
      freeze
    end

    ##
    # @return [Symbol, nil] effective request strategy
    attr_reader :strategy

    ##
    # @return [Integer, nil] effective redirect limit
    attr_reader :max_redirects

    ##
    # @return [Integer, nil] effective request budget
    attr_reader :max_requests

    ##
    # @param name [Symbol, String] request control name
    # @return [Boolean] whether the control was explicitly supplied
    def explicit?(name)
      explicit_keys.include?(name.to_sym)
    end

    ##
    # Returns a new instance carrying the given values and the same explicit keys.
    #
    # @param strategy [Symbol, nil] validated request strategy
    # @param max_redirects [Integer, nil] validated redirect limit
    # @param max_requests [Integer, nil] validated request budget
    # @return [RequestControls] controls updated with validated effective values
    def with_effective_values(strategy:, max_redirects:, max_requests:)
      self.class.new(
        strategy: strategy,
        max_redirects: max_redirects,
        max_requests: max_requests,
        explicit_keys: explicit_keys
      )
    end

    ##
    # Writes only the explicitly set controls into the provided config hash.
    #
    # @param config [Hash{Symbol => Object}] mutable config hash
    # @return [Hash{Symbol => Object}] the same hash with explicit controls written
    # @raise [ArgumentError] when an explicit request control must be written
    #   but `config[:request]` is neither nil nor a Hash
    def apply_to(config)
      config[:strategy] = strategy if explicit?(:strategy)

      { max_redirects: max_redirects, max_requests: max_requests }.each do |key, value|
        apply_request_value(config, key, value)
      end

      config
    end

    private

    attr_reader :explicit_keys

    # Writes one nested request control, but only when it was explicit.
    def apply_request_value(config, key, value)
      return unless explicit?(key)

      ensure_request_config!(config)
      config[:request][key] = value
    end

    # Makes sure `config[:request]` is a Hash, creating an empty one when absent.
    def ensure_request_config!(config)
      case config[:request]
      when nil then config[:request] = {}
      when Hash then nil
      else raise ArgumentError, 'request config must be a hash'
      end
    end
  end
end