html2rss 0.16.0 → 0.18.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (122) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +48 -657
  3. data/exe/html2rss +1 -1
  4. data/html2rss.gemspec +7 -4
  5. data/lib/html2rss/articles/deduplicator.rb +49 -0
  6. data/lib/html2rss/auto_source/cleanup.rb +33 -5
  7. data/lib/html2rss/auto_source/scraper/html.rb +118 -43
  8. data/lib/html2rss/auto_source/scraper/json_state.rb +377 -0
  9. data/lib/html2rss/auto_source/scraper/microdata.rb +399 -0
  10. data/lib/html2rss/auto_source/scraper/schema/category_extractor.rb +102 -0
  11. data/lib/html2rss/auto_source/scraper/schema/item_list.rb +2 -2
  12. data/lib/html2rss/auto_source/scraper/schema/list_item.rb +3 -3
  13. data/lib/html2rss/auto_source/scraper/schema/thing.rb +48 -8
  14. data/lib/html2rss/auto_source/scraper/schema.rb +12 -9
  15. data/lib/html2rss/auto_source/scraper/semantic_html/anchor_selector.rb +199 -0
  16. data/lib/html2rss/auto_source/scraper/semantic_html.rb +84 -78
  17. data/lib/html2rss/auto_source/scraper/wordpress_api/page_scope.rb +261 -0
  18. data/lib/html2rss/auto_source/scraper/wordpress_api/posts_endpoint.rb +134 -0
  19. data/lib/html2rss/auto_source/scraper/wordpress_api.rb +179 -0
  20. data/lib/html2rss/auto_source/scraper.rb +142 -8
  21. data/lib/html2rss/auto_source.rb +119 -47
  22. data/lib/html2rss/blocked_surface.rb +64 -0
  23. data/lib/html2rss/category_extractor.rb +82 -0
  24. data/lib/html2rss/cli.rb +170 -23
  25. data/lib/html2rss/config/class_methods.rb +189 -0
  26. data/lib/html2rss/config/dynamic_params.rb +68 -0
  27. data/lib/html2rss/config/multiple_feeds_config.rb +50 -0
  28. data/lib/html2rss/config/request_headers.rb +130 -0
  29. data/lib/html2rss/config/schema.rb +208 -0
  30. data/lib/html2rss/config/validator.rb +108 -0
  31. data/lib/html2rss/config.rb +112 -61
  32. data/lib/html2rss/error.rb +6 -0
  33. data/lib/html2rss/html_extractor/date_extractor.rb +19 -0
  34. data/lib/html2rss/html_extractor/enclosure_extractor.rb +101 -0
  35. data/lib/html2rss/html_extractor/image_extractor.rb +49 -0
  36. data/lib/html2rss/html_extractor.rb +136 -0
  37. data/lib/html2rss/html_navigator.rb +46 -0
  38. data/lib/html2rss/json_feed_builder/item.rb +94 -0
  39. data/lib/html2rss/json_feed_builder.rb +58 -0
  40. data/lib/html2rss/rendering/audio_renderer.rb +31 -0
  41. data/lib/html2rss/rendering/description_builder.rb +88 -0
  42. data/lib/html2rss/rendering/image_renderer.rb +31 -0
  43. data/lib/html2rss/rendering/media_renderer.rb +33 -0
  44. data/lib/html2rss/rendering/pdf_renderer.rb +28 -0
  45. data/lib/html2rss/rendering/video_renderer.rb +31 -0
  46. data/lib/html2rss/rendering.rb +14 -0
  47. data/lib/html2rss/request_controls.rb +128 -0
  48. data/lib/html2rss/request_service/browserless_strategy.rb +103 -7
  49. data/lib/html2rss/request_service/budget.rb +39 -0
  50. data/lib/html2rss/request_service/context.rb +64 -20
  51. data/lib/html2rss/request_service/faraday_strategy.rb +135 -5
  52. data/lib/html2rss/request_service/policy.rb +248 -0
  53. data/lib/html2rss/request_service/puppet_commander.rb +212 -13
  54. data/lib/html2rss/request_service/response.rb +42 -2
  55. data/lib/html2rss/request_service/response_guard.rb +62 -0
  56. data/lib/html2rss/request_service.rb +31 -15
  57. data/lib/html2rss/request_session/rel_next_pager.rb +70 -0
  58. data/lib/html2rss/request_session/runtime_input.rb +57 -0
  59. data/lib/html2rss/request_session/runtime_policy.rb +76 -0
  60. data/lib/html2rss/request_session.rb +118 -0
  61. data/lib/html2rss/rss_builder/article.rb +166 -0
  62. data/lib/html2rss/rss_builder/channel.rb +96 -11
  63. data/lib/html2rss/rss_builder/enclosure.rb +48 -0
  64. data/lib/html2rss/rss_builder/stylesheet.rb +4 -4
  65. data/lib/html2rss/rss_builder.rb +72 -71
  66. data/lib/html2rss/selectors/config.rb +122 -0
  67. data/lib/html2rss/selectors/extractors/attribute.rb +50 -0
  68. data/lib/html2rss/selectors/extractors/href.rb +53 -0
  69. data/lib/html2rss/selectors/extractors/html.rb +48 -0
  70. data/lib/html2rss/selectors/extractors/static.rb +41 -0
  71. data/lib/html2rss/selectors/extractors/text.rb +46 -0
  72. data/lib/html2rss/selectors/extractors.rb +52 -0
  73. data/lib/html2rss/selectors/object_to_xml_converter.rb +61 -0
  74. data/lib/html2rss/selectors/post_processors/base.rb +74 -0
  75. data/lib/html2rss/selectors/post_processors/gsub.rb +85 -0
  76. data/lib/html2rss/selectors/post_processors/html_to_markdown.rb +45 -0
  77. data/lib/html2rss/selectors/post_processors/html_transformers/transform_urls_to_absolute_ones.rb +35 -0
  78. data/lib/html2rss/selectors/post_processors/html_transformers/wrap_img_in_a.rb +47 -0
  79. data/lib/html2rss/selectors/post_processors/markdown_to_html.rb +52 -0
  80. data/lib/html2rss/selectors/post_processors/parse_time.rb +73 -0
  81. data/lib/html2rss/selectors/post_processors/parse_uri.rb +40 -0
  82. data/lib/html2rss/selectors/post_processors/sanitize_html.rb +150 -0
  83. data/lib/html2rss/selectors/post_processors/substring.rb +74 -0
  84. data/lib/html2rss/selectors/post_processors/template.rb +73 -0
  85. data/lib/html2rss/selectors/post_processors.rb +43 -0
  86. data/lib/html2rss/selectors.rb +294 -0
  87. data/lib/html2rss/url.rb +262 -0
  88. data/lib/html2rss/version.rb +1 -1
  89. data/lib/html2rss.rb +129 -70
  90. data/lib/tasks/config_schema.rake +17 -0
  91. data/schema/html2rss-config.schema.json +469 -0
  92. metadata +120 -46
  93. data/lib/html2rss/attribute_post_processors/base.rb +0 -74
  94. data/lib/html2rss/attribute_post_processors/gsub.rb +0 -64
  95. data/lib/html2rss/attribute_post_processors/html_to_markdown.rb +0 -43
  96. data/lib/html2rss/attribute_post_processors/html_transformers/transform_urls_to_absolute_ones.rb +0 -27
  97. data/lib/html2rss/attribute_post_processors/html_transformers/wrap_img_in_a.rb +0 -41
  98. data/lib/html2rss/attribute_post_processors/markdown_to_html.rb +0 -50
  99. data/lib/html2rss/attribute_post_processors/parse_time.rb +0 -46
  100. data/lib/html2rss/attribute_post_processors/parse_uri.rb +0 -46
  101. data/lib/html2rss/attribute_post_processors/sanitize_html.rb +0 -108
  102. data/lib/html2rss/attribute_post_processors/substring.rb +0 -72
  103. data/lib/html2rss/attribute_post_processors/template.rb +0 -101
  104. data/lib/html2rss/attribute_post_processors.rb +0 -44
  105. data/lib/html2rss/auto_source/article.rb +0 -127
  106. data/lib/html2rss/auto_source/channel.rb +0 -78
  107. data/lib/html2rss/auto_source/reducer.rb +0 -48
  108. data/lib/html2rss/auto_source/rss_builder.rb +0 -70
  109. data/lib/html2rss/auto_source/scraper/semantic_html/extractor.rb +0 -136
  110. data/lib/html2rss/auto_source/scraper/semantic_html/image.rb +0 -54
  111. data/lib/html2rss/config/channel.rb +0 -125
  112. data/lib/html2rss/config/selectors.rb +0 -103
  113. data/lib/html2rss/item.rb +0 -186
  114. data/lib/html2rss/item_extractors/attribute.rb +0 -50
  115. data/lib/html2rss/item_extractors/href.rb +0 -52
  116. data/lib/html2rss/item_extractors/html.rb +0 -46
  117. data/lib/html2rss/item_extractors/static.rb +0 -39
  118. data/lib/html2rss/item_extractors/text.rb +0 -44
  119. data/lib/html2rss/item_extractors.rb +0 -88
  120. data/lib/html2rss/object_to_xml_converter.rb +0 -56
  121. data/lib/html2rss/rss_builder/item.rb +0 -83
  122. data/lib/html2rss/utils.rb +0 -113
@@ -0,0 +1,101 @@
# frozen_string_literal: true

module Html2rss
  class HtmlExtractor
    ##
    # Extracts enclosures from HTML tags using various strategies.
    #
    # Every strategy listed in {.call} is run against the same container and the
    # per-strategy results are concatenated in that order.
    class EnclosureExtractor
      ##
      # @param article_tag [Nokogiri::XML::Node] container searched for enclosure candidates
      # @param base_url [String, Html2rss::Url] base url used to resolve relative sources
      # @return [Array<Hash>] `{ url:, type: }` hashes collected from all strategies
      def self.call(article_tag, base_url)
        [
          Extractors::Image,
          Extractors::Media,
          Extractors::Pdf,
          Extractors::Iframe,
          Extractors::Archive
        ].flat_map { |strategy| strategy.call(article_tag, base_url:) }
      end
    end

    module Extractors
      # Extracts image enclosures from HTML tags.
      # Finds all image sources and returns them in a format suitable for RSS.
      class Image
        ##
        # Considers `img` tags whose `src` does not start with "data".
        # 'image/jpeg' is handed to the content-type guess as its default.
        #
        # @param article_tag [Nokogiri::XML::Node]
        # @param base_url [String, Html2rss::Url]
        # @return [Array<Hash>] `{ url:, type: }` per image
        def self.call(article_tag, base_url:)
          article_tag.css('img[src]:not([src^="data"])').filter_map do |img|
            src = img['src'].to_s
            next if src.empty?

            abs_url = Url.from_relative(src, base_url)
            {
              url: abs_url,
              type: RssBuilder::Enclosure.guess_content_type_from_url(abs_url, default: 'image/jpeg')
            }
          end
        end
      end

      # Extracts media enclosures (video/audio) from HTML tags.
      class Media
        ##
        # The type is copied verbatim from the element's `type` attribute,
        # so it is nil when the markup does not declare one.
        #
        # @param article_tag [Nokogiri::XML::Node]
        # @param base_url [String, Html2rss::Url]
        # @return [Array<Hash>] `{ url:, type: }` per media source
        def self.call(article_tag, base_url:)
          article_tag.css('video source[src], audio source[src], audio[src]').filter_map do |element|
            src = element['src'].to_s
            next if src.empty?

            {
              url: Url.from_relative(src, base_url),
              type: element['type']
            }
          end
        end
      end

      # Extracts PDF enclosures from HTML tags.
      class Pdf
        ##
        # Considers links whose `href` ends in ".pdf".
        #
        # @param article_tag [Nokogiri::XML::Node]
        # @param base_url [String, Html2rss::Url]
        # @return [Array<Hash>] `{ url:, type: }` per PDF link
        def self.call(article_tag, base_url:)
          article_tag.css('a[href$=".pdf"]').filter_map do |link|
            href = link['href'].to_s
            next if href.empty?

            abs_url = Url.from_relative(href, base_url)
            {
              url: abs_url,
              type: RssBuilder::Enclosure.guess_content_type_from_url(abs_url)
            }
          end
        end
      end

      # Extracts iframe enclosures from HTML tags.
      class Iframe
        ##
        # 'text/html' is handed to the content-type guess as its default.
        #
        # @param article_tag [Nokogiri::XML::Node]
        # @param base_url [String, Html2rss::Url]
        # @return [Array<Hash>] `{ url:, type: }` per iframe
        def self.call(article_tag, base_url:)
          article_tag.css('iframe[src]').filter_map do |iframe|
            # Same nil/empty handling as the sibling extractors.
            src = iframe['src'].to_s
            next if src.empty?

            abs_url = Url.from_relative(src, base_url)
            {
              url: abs_url,
              type: RssBuilder::Enclosure.guess_content_type_from_url(abs_url, default: 'text/html')
            }
          end
        end
      end

      # Extracts archive enclosures (zip, tar.gz, tgz) from HTML tags.
      class Archive
        SELECTOR = 'a[href$=".zip"], a[href$=".tar.gz"], a[href$=".tgz"]'
        ZIP_TYPE = 'application/zip'
        GZIP_TYPE = 'application/gzip'

        ##
        # Links ending in ".zip" are typed as ZIP; the remaining matches
        # (".tar.gz" and ".tgz") are gzip-compressed and typed accordingly.
        #
        # @param article_tag [Nokogiri::XML::Node]
        # @param base_url [String, Html2rss::Url]
        # @return [Array<Hash>] `{ url:, type: }` per archive link
        def self.call(article_tag, base_url:)
          article_tag.css(SELECTOR).filter_map do |link|
            href = link['href'].to_s
            next if href.empty?

            {
              url: Url.from_relative(href, base_url),
              type: href.end_with?('.zip') ? ZIP_TYPE : GZIP_TYPE
            }
          end
        end
      end
    end
  end
end
@@ -0,0 +1,49 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Html2rss
4
+ class HtmlExtractor
5
+ ##
6
+ # ImageExtractor is responsible for extracting image URLs from the article_tag.
7
+ class ImageExtractor
8
+ def self.call(article_tag, base_url:)
9
+ img_src = from_source(article_tag) ||
10
+ from_img(article_tag) ||
11
+ from_style(article_tag)
12
+
13
+ Url.from_relative(img_src, base_url) if img_src
14
+ end
15
+
16
+ def self.from_img(article_tag)
17
+ article_tag.at_css('img[src]:not([src^="data"])')&.[]('src')
18
+ end
19
+
20
+ ##
21
+ # Extracts the largest image source from the srcset attribute
22
+ # of an img tag or a source tag inside a picture tag.
23
+ #
24
+ # @see <https://developer.mozilla.org/en-US/docs/Learn/HTML/Multimedia_and_embedding/Responsive_images>
25
+ # @see <https://developer.mozilla.org/en-US/docs/Web/HTML/Element/img#srcset>
26
+ # @see <https://developer.mozilla.org/en-US/docs/Web/HTML/Element/picture>
27
+ def self.from_source(article_tag) # rubocop:disable Metrics/AbcSize
28
+ hash = article_tag.css('img[srcset], picture > source[srcset]').flat_map do |source|
29
+ source['srcset'].to_s.scan(/(\S+)\s+(\d+w|\d+h)[\s,]?/).map do |url, width|
30
+ next if url.nil? || url.start_with?('data:')
31
+
32
+ width_value = width.to_i.zero? ? 0 : width.scan(/\d+/).first.to_i
33
+
34
+ [width_value, url.strip]
35
+ end
36
+ end.compact.to_h
37
+
38
+ hash[hash.keys.max]
39
+ end
40
+
41
+ def self.from_style(article_tag)
42
+ article_tag.css('[style*="url"]')
43
+ .filter_map { |tag| tag['style'][/url\(['"]?(.*?)['"]?\)/, 1] }
44
+ .reject { |src| src.start_with?('data:') }
45
+ .max_by(&:size)
46
+ end
47
+ end
48
+ end
49
+ end
@@ -0,0 +1,136 @@
# frozen_string_literal: true

module Html2rss
  ##
  # HtmlExtractor is responsible for extracting details (headline, url, images, etc.)
  # from an article_tag.
  class HtmlExtractor
    INVISIBLE_CONTENT_TAGS = %w[svg script noscript style template].to_set.freeze
    HEADING_TAGS = %w[h1 h2 h3 h4 h5 h6].freeze
    # NOTE(review): this is an Array of separate rules (`:not(h1)` … `:not(h6)`, then the
    # bare invisible tag names), not one compound selector — confirm the intended match set.
    NON_HEADLINE_SELECTOR = (HEADING_TAGS.map { |tag| ":not(#{tag})" } + INVISIBLE_CONTENT_TAGS.to_a).freeze

    # Anchors with a non-empty href that does not start with one of the listed prefixes.
    MAIN_ANCHOR_SELECTOR = begin
      buf = +'a[href]:not([href=""])'
      %w[# javascript: mailto: tel: file:// sms: data:].each do |prefix|
        buf << %[:not([href^="#{prefix}"])]
      end
      buf.freeze
    end

    class << self
      ##
      # Extracts visible text from a given node and its children.
      #
      # Children rejected by the visibility check are skipped. Nested children
      # are always joined with the default separator; `separator` only applies
      # to the fragments of the top-level call.
      #
      # @param tag [Nokogiri::XML::Node] the node from which to extract visible text
      # @param separator [String] separator used to join text fragments (default is a space)
      # @return [String, nil] the concatenated visible text, or nil if none is found
      def extract_visible_text(tag, separator: ' ')
        parts = tag.children.filter_map do |child|
          next unless visible_child?(child)

          raw_text = child.children.empty? ? child.text : extract_visible_text(child)
          text = raw_text&.strip
          text unless text.to_s.empty?
        end

        parts.join(separator).squeeze(' ').strip unless parts.empty?
      end

      ##
      # @param article_tag [Nokogiri::XML::Node] article-like container to search within
      # @return [Nokogiri::XML::Node, nil] the container itself when it is an eligible anchor,
      #   otherwise its first eligible descendant anchor
      def main_anchor_for(article_tag)
        return article_tag if article_tag.name == 'a' && article_tag.matches?(MAIN_ANCHOR_SELECTOR)

        article_tag.at_css(MAIN_ANCHOR_SELECTOR)
      end

      private

      # A child is hidden when it is one of the invisible tags or an
      # in-page jump link (`<a href="#...">`).
      def visible_child?(node)
        !INVISIBLE_CONTENT_TAGS.include?(node.name) &&
          !(node.name == 'a' && node['href']&.start_with?('#'))
      end
    end

    ##
    # @param article_tag [Nokogiri::XML::Node] article-like container to extract from
    # @param base_url [String, Html2rss::Url] base url used to resolve relative links
    # @param selected_anchor [Nokogiri::XML::Node, nil] explicit primary anchor for the container
    # @raise [ArgumentError] when article_tag is nil or false
    def initialize(article_tag, base_url:, selected_anchor:)
      raise ArgumentError, 'article_tag is required' unless article_tag

      @article_tag = article_tag
      @base_url = base_url
      @selected_anchor = selected_anchor
    end

    ##
    # @return [Hash] the extracted details under the keys :title, :url, :image,
    #   :description, :id, :published_at, :enclosures and :categories
    def call
      {
        title: extract_title,
        url: extract_url,
        image: extract_image,
        description: extract_description,
        id: generate_id,
        published_at: extract_published_at,
        enclosures: extract_enclosures,
        categories: extract_categories
      }
    end

    private

    attr_reader :article_tag, :base_url, :selected_anchor

    # Resolves the selected anchor's href (without its fragment) against base_url.
    # Returns nil when there is no anchor or the href is empty.
    def extract_url
      @extract_url ||= begin
        href = selected_anchor&.[]('href').to_s

        # `"#".split('#')` is `[]`, so `first` can be nil; `to_s` makes a bare "#"
        # behave like any other fragment-only href instead of raising NoMethodError.
        Url.from_relative(href.split('#').first.to_s.strip, base_url) unless href.empty?
      end
    end

    # The title comes from the chosen heading, falling back to the selected anchor.
    def extract_title
      title_source = heading || selected_anchor
      self.class.extract_visible_text(title_source) if title_source
    end

    # Picks, among the headings whose tag name sorts lowest (h1 before h2, ...),
    # the one with the longest visible text.
    def heading
      @heading ||= begin
        heading_tags = article_tag.css(HEADING_TAGS.join(',')).group_by(&:name)
        smallest_heading = heading_tags.keys.min
        if smallest_heading
          heading_tags[smallest_heading]&.max_by do |tag|
            self.class.extract_visible_text(tag)&.size.to_i
          end
        end
      end
    end

    # Prefers the text of the NON_HEADLINE_SELECTOR matches joined with '<br>';
    # falls back to the container's whole visible text.
    def extract_description
      text = self.class.extract_visible_text(article_tag.css(NON_HEADLINE_SELECTOR), separator: '<br>')
      return text if text && !text.empty?

      description = self.class.extract_visible_text(article_tag)
      return nil if description.nil? || description.strip.empty?

      description.strip
    end

    # First non-empty candidate of: the container's id, the first descendant id,
    # the url path, the url query.
    def generate_id
      [
        article_tag['id'],
        article_tag.at_css('[id]')&.attr('id'),
        extract_url&.path,
        extract_url&.query
      ].compact.reject(&:empty?).first
    end

    def extract_image = ImageExtractor.call(article_tag, base_url:)
    def extract_published_at = DateExtractor.call(article_tag)
    def extract_enclosures = EnclosureExtractor.call(article_tag, base_url)
    def extract_categories = CategoryExtractor.call(article_tag)
  end
end
@@ -0,0 +1,46 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Html2rss
4
+ ##
5
+ # HtmlNavigator provides methods to navigate through HTML nodes.
6
+ class HtmlNavigator
7
+ class << self
8
+ ##
9
+ # Returns the node itself if it satisfies the condition,
10
+ # otherwise its closest ancestor that does (nil once the search reaches `html` or the document).
11
+ #
12
+ # @param node [Nokogiri::XML::Node] The node to start the search from.
13
+ # @param condition [Proc] The condition to be met.
14
+ # @return [Nokogiri::XML::Node, nil] The first parent that satisfies the condition.
15
+ def parent_until_condition(node, condition)
16
+ while node && !node.document? && node.name != 'html'
17
+ return node if condition.call(node)
18
+
19
+ node = node.parent
20
+ end
21
+ end
22
+
23
+ ##
24
+ # Think of it as a `css_upwards` method.
25
+ # It runs the selector within the current tag, then within each ancestor in turn, and returns the first match.
26
+ def find_closest_selector_upwards(current_tag, selector)
27
+ while current_tag
28
+ found = current_tag.at_css(selector)
29
+ return found if found
30
+
31
+ return nil unless current_tag.respond_to?(:parent)
32
+
33
+ current_tag = current_tag.parent
34
+ end
35
+ end
36
+
37
+ ##
38
+ # Returns the tag itself if it has the given tag name, otherwise its closest ancestor with that name.
39
+ def find_tag_in_ancestors(current_tag, tag_name)
40
+ return current_tag if current_tag.name == tag_name
41
+
42
+ current_tag.ancestors(tag_name).first
43
+ end
44
+ end
45
+ end
46
+ end
@@ -0,0 +1,94 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Html2rss
4
+ class JsonFeedBuilder
5
+ ##
6
+ # Maps an {Html2rss::RssBuilder::Article} to a JSONFeed 1.1 item hash.
7
+ class Item
8
+ ##
9
+ # @param article [Html2rss::RssBuilder::Article]
10
+ def initialize(article)
11
+ @article = article
12
+ end
13
+
14
+ ##
15
+ # @return [Hash, nil] the JSONFeed-compliant item hash
16
+ def to_h
17
+ content = content_fields
18
+ return if content.empty?
19
+
20
+ item_payload.merge(content).compact
21
+ end
22
+
23
+ private
24
+
25
+ attr_reader :article
26
+
27
+ ##
28
+ # @return [Hash]
29
+ def item_payload
30
+ {
31
+ id: article.guid,
32
+ url: article.url&.to_s,
33
+ title: article.title,
34
+ image: article.image&.to_s,
35
+ date_published: article.published_at&.iso8601,
36
+ authors: author_array,
37
+ tags:,
38
+ attachments:
39
+ }
40
+ end
41
+
42
+ ##
43
+ # @return [Array<Hash>, nil]
44
+ def author_array
45
+ return unless (name = article.author)
46
+
47
+ [{ name: }]
48
+ end
49
+
50
+ ##
51
+ # JSON Feed items must include content_html or content_text.
52
+ # @return [Hash]
53
+ def content_fields
54
+ description = article.description
55
+ return { content_html: description } if description
56
+
57
+ title = article.title
58
+ return { content_text: title } if title
59
+
60
+ {}
61
+ end
62
+
63
+ ##
64
+ # @return [Array<String>, nil]
65
+ def tags
66
+ cats = article.categories
67
+ cats.empty? ? nil : cats
68
+ end
69
+
70
+ ##
71
+ # Maps enclosures to JSONFeed attachment objects.
72
+ # @return [Array<Hash>, nil]
73
+ def attachments
74
+ enclosures = article.enclosures
75
+ return nil if enclosures.empty?
76
+
77
+ enclosures.map { |enc| attachment_hash(enc) }
78
+ end
79
+
80
+ ##
81
+ # @param enclosure [Html2rss::RssBuilder::Enclosure]
82
+ # @return [Hash]
83
+ def attachment_hash(enclosure)
84
+ size = enclosure.bits_length
85
+
86
+ {
87
+ url: enclosure.url.to_s,
88
+ mime_type: enclosure.type,
89
+ size_in_bytes: size&.positive? ? size : nil
90
+ }.compact
91
+ end
92
+ end
93
+ end
94
+ end
@@ -0,0 +1,58 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Html2rss
4
+ ##
5
+ # Builds a JSONFeed 1.1 hash from channel metadata and articles.
6
+ #
7
+ # @see https://www.jsonfeed.org/version/1.1/
8
+ class JsonFeedBuilder
9
+ VERSION_URL = 'https://jsonfeed.org/version/1.1'
10
+
11
+ ##
12
+ # @param channel [Html2rss::RssBuilder::Channel]
13
+ # @param articles [Array<Html2rss::RssBuilder::Article>]
14
+ def initialize(channel:, articles:)
15
+ @channel = channel
16
+ @articles = articles
17
+ end
18
+
19
+ ##
20
+ # Builds and returns the JSONFeed hash.
21
+ #
22
+ # @return [Hash] the JSONFeed-compliant hash
23
+ def call
24
+ base_payload.merge(authors: author_array, items: item_hashes).compact
25
+ end
26
+
27
+ private
28
+
29
+ attr_reader :channel, :articles
30
+
31
+ ##
32
+ # @return [Hash]
33
+ def base_payload
34
+ {
35
+ version: VERSION_URL,
36
+ title: channel.title,
37
+ home_page_url: channel.url.to_s,
38
+ description: channel.description,
39
+ language: channel.language,
40
+ icon: channel.image&.to_s
41
+ }
42
+ end
43
+
44
+ ##
45
+ # @return [Array<Hash>]
46
+ def item_hashes
47
+ articles.filter_map { |article| Item.new(article).to_h }
48
+ end
49
+
50
+ ##
51
+ # @return [Array<Hash>, nil]
52
+ def author_array
53
+ return unless (name = channel.author)
54
+
55
+ [{ name: }]
56
+ end
57
+ end
58
+ end
@@ -0,0 +1,31 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'cgi'
4
+
5
+ module Html2rss
6
+ module Rendering
7
+ # Renders an HTML <audio> tag from a URL and type.
8
+ class AudioRenderer
9
+ def initialize(url:, type:)
10
+ @url = url
11
+ @type = type
12
+ end
13
+
14
+ def to_html
15
+ %(<audio controls preload="none" referrerpolicy="no-referrer" crossorigin="anonymous">
16
+ <source src="#{escaped_url}" type="#{escaped_type}">
17
+ </audio>)
18
+ end
19
+
20
+ private
21
+
22
+ def escaped_url
23
+ CGI.escapeHTML(@url.to_s)
24
+ end
25
+
26
+ def escaped_type
27
+ CGI.escapeHTML(@type.to_s)
28
+ end
29
+ end
30
+ end
31
+ end
@@ -0,0 +1,88 @@
# frozen_string_literal: true

module Html2rss
  module Rendering
    # Builds a sanitized article description from the base text, title, and optional media.
    #
    # Combines media elements (images, audio, video, PDFs) with sanitized text content
    # to create rich RSS descriptions that reveal more scraped information.
    #
    # @example Basic usage
    #   builder = DescriptionBuilder.new(
    #     base: "Article content",
    #     title: "Article Title",
    #     url: "https://example.com",
    #     enclosures: [enclosure_object],
    #     image: "https://example.com/image.jpg"
    #   )
    #   description = builder.call
    #
    class DescriptionBuilder
      # Removes the first occurrence of the pattern from the text, provided that
      # occurrence starts before +end_of_range+. Anything that is not a String
      # (text or pattern) leaves the text untouched.
      #
      # @param text [String]
      # @param pattern [String]
      # @param end_of_range [Integer, nil] Optional, defaults to half the text length
      # @return [String]
      def self.remove_pattern_from_start(text, pattern, end_of_range: nil)
        return text unless text.is_a?(String) && pattern.is_a?(String)

        # Computed after the type guard so non-String input never reaches `text.size`.
        end_of_range ||= text.size / 2

        index = text.index(pattern)
        return text if index.nil? || index >= end_of_range

        # Cut out exactly the occurrence found above; later repetitions
        # (including ones on following lines) stay in place.
        text[0...index] + text[(index + pattern.size)..]
      end

      # @param base [String] The base text content for the description
      # @param title [String] The article title (used for alt text and title removal)
      # @param url [String, Html2rss::Url] The article URL (used for sanitization)
      # @param enclosures [Array<Html2rss::RssBuilder::Enclosure>, nil] Media enclosures
      # @param image [String, Html2rss::Url, nil] Fallback image URL
      def initialize(base:, title:, url:, enclosures:, image:)
        @base = base.to_s
        @title = title
        @url = url
        @enclosures = Array(enclosures)
        @image = image
      end

      # Generates the complete description with media and sanitized text.
      #
      # Rendered media comes first, followed by the processed base text;
      # fragments are separated by a blank line.
      #
      # @return [String, nil] The complete description or nil if empty
      def call
        fragments = []
        fragments.concat(Array(rendered_media))
        fragments << processed_base_description

        result = fragments.compact.join("\n\n").strip
        result.empty? ? nil : result
      end

      private

      # Rendered enclosures win; the fallback image is only used when no
      # enclosure produced any markup.
      def rendered_media
        rendered = render_enclosures
        return rendered if rendered.any?
        return render_fallback_image if @image

        []
      end

      # Enclosures without a matching renderer are dropped.
      def render_enclosures
        @enclosures.filter_map do |enclosure|
          MediaRenderer.for(enclosure:, image: @image, title: @title)&.to_html
        end
      end

      def render_fallback_image
        [MediaRenderer.for(enclosure: nil, image: @image, title: @title)&.to_html]
      end

      # Strips a leading copy of the title from the base text, then sanitizes it.
      def processed_base_description
        text = self.class.remove_pattern_from_start(@base, @title)
        Html2rss::Selectors::PostProcessors::SanitizeHtml.get(text, @url)
      end
    end
  end
end
@@ -0,0 +1,31 @@
# frozen_string_literal: true

require 'cgi'

module Html2rss
  module Rendering
    # Renders an HTML <img> tag from a URL and title.
    class ImageRenderer
      # @param url [#to_s] image source placed in the `src` attribute
      # @param title [#to_s] text used for both the `alt` and `title` attributes
      def initialize(url:, title:)
        @url = url
        @title = title
      end

      # Builds the tag on a single line: newlines are removed and every run of
      # whitespace is collapsed to one space.
      #
      # @return [String] the `<img>` markup with HTML-escaped attribute values
      def to_html
        %(<img src="#{escaped_url}"
          alt="#{escaped_title}"
          title="#{escaped_title}"
          loading="lazy"
          referrerpolicy="no-referrer"
          decoding="async"
          crossorigin="anonymous">).delete("\n").gsub(/\s+/, ' ')
      end

      private

      # Escaped like the title so a quote or angle bracket in the URL
      # cannot break out of the `src` attribute.
      def escaped_url
        CGI.escapeHTML(@url.to_s)
      end

      def escaped_title
        CGI.escapeHTML(@title.to_s)
      end
    end
  end
end
@@ -0,0 +1,33 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Html2rss
4
+ module Rendering
5
+ # Factory: picks the appropriate renderer for a given enclosure or fallback image.
6
+ class MediaRenderer
7
+ # @param enclosure [Html2rss::RssBuilder::Enclosure, nil]
8
+ # @param image [String, Html2rss::Url, nil] Fallback image URL
9
+ # @param title [String]
10
+ # @return [ImageRenderer, VideoRenderer, AudioRenderer, PdfRenderer, nil]
11
+ def self.for(enclosure:, image:, title:)
12
+ return ImageRenderer.new(url: image, title:) if enclosure.nil? && image
13
+ return nil unless enclosure
14
+
15
+ create_renderer_for_type(enclosure.type, url: enclosure.url, title:)
16
+ end
17
+
18
+ # @private
19
+ def self.create_renderer_for_type(type, url:, title:)
20
+ case type
21
+ when %r{^image/}
22
+ ImageRenderer.new(url:, title:)
23
+ when %r{^video/}
24
+ VideoRenderer.new(url:, type:)
25
+ when %r{^audio/}
26
+ AudioRenderer.new(url:, type:)
27
+ when 'application/pdf'
28
+ PdfRenderer.new(url:)
29
+ end
30
+ end
31
+ end
32
+ end
33
+ end
@@ -0,0 +1,28 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'cgi'
4
+
5
+ module Html2rss
6
+ module Rendering
7
+ # Renders an HTML <iframe> for PDF documents.
8
+ class PdfRenderer
9
+ def initialize(url:)
10
+ @url = url
11
+ end
12
+
13
+ def to_html
14
+ %(<iframe src="#{escaped_url}" width="100%" height="75vh"
15
+ sandbox=""
16
+ referrerpolicy="no-referrer"
17
+ loading="lazy">
18
+ </iframe>)
19
+ end
20
+
21
+ private
22
+
23
+ def escaped_url
24
+ CGI.escapeHTML(@url.to_s)
25
+ end
26
+ end
27
+ end
28
+ end