html2rss 0.18.0 → 0.19.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (88) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +60 -1
  3. data/lib/html2rss/articles/deduplicator.rb +1 -0
  4. data/lib/html2rss/auto_source/cleanup.rb +11 -0
  5. data/lib/html2rss/auto_source/scraper/html.rb +5 -0
  6. data/lib/html2rss/auto_source/scraper/json_state.rb +96 -16
  7. data/lib/html2rss/auto_source/scraper/microdata.rb +107 -1
  8. data/lib/html2rss/auto_source/scraper/schema/category_extractor.rb +1 -1
  9. data/lib/html2rss/auto_source/scraper/schema/item_list.rb +1 -0
  10. data/lib/html2rss/auto_source/scraper/schema/list_item.rb +3 -1
  11. data/lib/html2rss/auto_source/scraper/schema/thing.rb +21 -0
  12. data/lib/html2rss/auto_source/scraper/schema.rb +15 -4
  13. data/lib/html2rss/auto_source/scraper/semantic_html/anchor_selector.rb +5 -0
  14. data/lib/html2rss/auto_source/scraper/semantic_html.rb +4 -0
  15. data/lib/html2rss/auto_source/scraper/wordpress_api/page_scope.rb +60 -10
  16. data/lib/html2rss/auto_source/scraper/wordpress_api/posts_endpoint.rb +3 -2
  17. data/lib/html2rss/auto_source/scraper/wordpress_api.rb +19 -12
  18. data/lib/html2rss/auto_source/scraper.rb +19 -1
  19. data/lib/html2rss/auto_source.rb +4 -0
  20. data/lib/html2rss/blocked_surface.rb +1 -0
  21. data/lib/html2rss/category_extractor.rb +2 -2
  22. data/lib/html2rss/cli.rb +30 -6
  23. data/lib/html2rss/config/class_methods.rb +24 -35
  24. data/lib/html2rss/config/dynamic_params.rb +6 -4
  25. data/lib/html2rss/config/multiple_feeds_config.rb +3 -2
  26. data/lib/html2rss/config/request_headers.rb +9 -3
  27. data/lib/html2rss/config/schema.rb +33 -1
  28. data/lib/html2rss/config/validator.rb +40 -2
  29. data/lib/html2rss/config.rb +19 -13
  30. data/lib/html2rss/error.rb +25 -0
  31. data/lib/html2rss/feed_pipeline/auto_fallback.rb +127 -0
  32. data/lib/html2rss/feed_pipeline.rb +127 -0
  33. data/lib/html2rss/hash_util.rb +101 -0
  34. data/lib/html2rss/html_extractor/date_extractor.rb +1 -0
  35. data/lib/html2rss/html_extractor/enclosure_extractor.rb +19 -0
  36. data/lib/html2rss/html_extractor/image_extractor.rb +9 -0
  37. data/lib/html2rss/html_extractor.rb +5 -0
  38. data/lib/html2rss/html_navigator.rb +8 -0
  39. data/lib/html2rss/json_feed_builder.rb +1 -0
  40. data/lib/html2rss/rendering/audio_renderer.rb +8 -3
  41. data/lib/html2rss/rendering/description_builder.rb +0 -1
  42. data/lib/html2rss/rendering/image_renderer.rb +17 -7
  43. data/lib/html2rss/rendering/media_renderer.rb +4 -0
  44. data/lib/html2rss/rendering/pdf_renderer.rb +11 -5
  45. data/lib/html2rss/rendering/video_renderer.rb +8 -3
  46. data/lib/html2rss/rendering.rb +11 -2
  47. data/lib/html2rss/request_controls.rb +16 -21
  48. data/lib/html2rss/request_service/botasaurus_contract.rb +161 -0
  49. data/lib/html2rss/request_service/botasaurus_strategy.rb +98 -0
  50. data/lib/html2rss/request_service/context.rb +14 -2
  51. data/lib/html2rss/request_service/faraday_strategy.rb +6 -4
  52. data/lib/html2rss/request_service/policy.rb +4 -0
  53. data/lib/html2rss/request_service/response.rb +9 -1
  54. data/lib/html2rss/request_service.rb +19 -0
  55. data/lib/html2rss/request_session/runtime_input.rb +16 -2
  56. data/lib/html2rss/request_session/runtime_policy.rb +7 -0
  57. data/lib/html2rss/request_session.rb +13 -9
  58. data/lib/html2rss/rss_builder/article.rb +22 -1
  59. data/lib/html2rss/rss_builder/channel.rb +11 -2
  60. data/lib/html2rss/rss_builder/enclosure.rb +15 -1
  61. data/lib/html2rss/rss_builder/stylesheet.rb +4 -0
  62. data/lib/html2rss/rss_builder.rb +4 -0
  63. data/lib/html2rss/selectors/config.rb +1 -0
  64. data/lib/html2rss/selectors/extractors/attribute.rb +2 -0
  65. data/lib/html2rss/selectors/extractors/href.rb +2 -0
  66. data/lib/html2rss/selectors/extractors/html.rb +1 -0
  67. data/lib/html2rss/selectors/extractors/static.rb +2 -1
  68. data/lib/html2rss/selectors/extractors/text.rb +1 -0
  69. data/lib/html2rss/selectors/extractors.rb +2 -1
  70. data/lib/html2rss/selectors/object_to_xml_converter.rb +1 -0
  71. data/lib/html2rss/selectors/post_processors/base.rb +13 -7
  72. data/lib/html2rss/selectors/post_processors/gsub.rb +3 -0
  73. data/lib/html2rss/selectors/post_processors/html_to_markdown.rb +3 -0
  74. data/lib/html2rss/selectors/post_processors/html_transformers/transform_urls_to_absolute_ones.rb +9 -0
  75. data/lib/html2rss/selectors/post_processors/html_transformers/wrap_img_in_a.rb +6 -0
  76. data/lib/html2rss/selectors/post_processors/markdown_to_html.rb +3 -0
  77. data/lib/html2rss/selectors/post_processors/parse_time.rb +5 -0
  78. data/lib/html2rss/selectors/post_processors/parse_uri.rb +3 -0
  79. data/lib/html2rss/selectors/post_processors/sanitize_html.rb +5 -1
  80. data/lib/html2rss/selectors/post_processors/substring.rb +3 -0
  81. data/lib/html2rss/selectors/post_processors/template.rb +3 -0
  82. data/lib/html2rss/selectors/post_processors.rb +5 -0
  83. data/lib/html2rss/selectors.rb +7 -0
  84. data/lib/html2rss/url.rb +27 -23
  85. data/lib/html2rss/version.rb +2 -1
  86. data/lib/html2rss.rb +15 -78
  87. data/schema/html2rss-config.schema.json +83 -1
  88. metadata +7 -2
@@ -0,0 +1,127 @@
# frozen_string_literal: true

module Html2rss
  ##
  # Turns a raw feed config into a feed by running the request, extraction,
  # and rendering stages in order.
  class FeedPipeline
    ##
    # @param raw_config [Hash{Symbol => Object}] user-provided feed config
    def initialize(raw_config)
      @raw_config = raw_config
    end

    ##
    # Runs the pipeline and renders the result with RssBuilder.
    #
    # @return [RSS::Rss] generated RSS feed
    def to_rss
      run do |response:, config:, articles:|
        RssBuilder.new(
          channel: channel_for(response, config),
          articles:,
          stylesheets: config.stylesheets
        ).call
      end
    end

    ##
    # Runs the pipeline and renders the result with JsonFeedBuilder.
    #
    # @return [Hash] generated JSONFeed 1.1 payload
    def to_json_feed
      run do |response:, config:, articles:|
        JsonFeedBuilder.new(channel: channel_for(response, config), articles:).call
      end
    end

    private

    attr_reader :raw_config

    # Builds the channel from the initial response, applying config overrides.
    def channel_for(response, config)
      RssBuilder::Channel.new(response, overrides: config.channel)
    end

    # Builds the config, resolves the pipeline state, and yields
    # `response:`, `config:` and `articles:` to the rendering block.
    def run
      config = Config.from_hash(raw_config, params: raw_config[:params])
      state = pipeline_state_for(config)

      yield response: state.fetch(:response), config:, articles: state.fetch(:articles)
    end

    # Dispatches to the auto-fallback chain for the :auto strategy and to a
    # single-strategy run otherwise.
    def pipeline_state_for(config)
      return run_auto_pipeline(config) if config.strategy == :auto

      run_pipeline_for_strategy(config, strategy: config.strategy)
    end

    # Fetches the initial response with one strategy and collects its articles.
    #
    # @return [Hash{Symbol => Object}] hash with :response and :articles
    def run_pipeline_for_strategy(config, strategy:, budget: nil)
      session = request_session_for(config, strategy:, budget:)
      initial_response = session.fetch_initial_response
      unique_articles = deduplicated_articles(response: initial_response, config:, request_session: session)

      { response: initial_response, articles: unique_articles }
    end

    def request_session_for(config, strategy:, budget: nil)
      runtime_input = runtime_input_for(config, strategy:)
      RequestSession.from_runtime_input(runtime_input, budget:)
    end

    def runtime_input_for(config, strategy:)
      RequestSession::RuntimeInput.new(
        url: config.url,
        headers: config.headers,
        request: config.request,
        strategy:,
        request_policy: RequestSession::RuntimePolicy.from_config(config)
      )
    end

    # Collects articles from every configured source and passes them through
    # Articles::Deduplicator.
    def deduplicated_articles(response:, config:, request_session:)
      collected = collect_articles(response:, config:, request_session:)
      Articles::Deduplicator.new(collected).call
    end

    def run_auto_pipeline(config)
      auto_fallback_for(config).call
    end

    # Wires AutoFallback with callbacks that build a request session per
    # strategy and extract articles per response, sharing one budget.
    def auto_fallback_for(config)
      build_session = ->(strategy:, budget:) { request_session_for(config, strategy:, budget:) }
      extract_articles = lambda do |response:, request_session:|
        deduplicated_articles(response:, config:, request_session:)
      end

      AutoFallback.new(
        strategies: AutoFallback::CHAIN,
        budget: auto_pipeline_budget(config),
        session_for: build_session,
        articles_for: extract_articles
      )
    end

    # Builds the request budget from the runtime policy's max_requests.
    def auto_pipeline_budget(config)
      policy = RequestSession::RuntimePolicy.from_config(config)
      RequestService::Budget.new(max_requests: policy.max_requests)
    end

    # Selector-based articles come first, followed by auto-source articles.
    def collect_articles(response:, config:, request_session:)
      from_selectors = selector_articles(response:, config:, request_session:)
      from_auto_source = auto_source_articles(response:, config:, request_session:)

      from_selectors + from_auto_source
    end

    # Extracts articles with the configured selectors from every page response.
    # Returns an empty array when no selectors are configured.
    def selector_articles(response:, config:, request_session:)
      selectors = config.selectors
      return [] unless selectors

      page_responses_for(response:, selectors:, request_session:).flat_map do |page_response|
        Selectors.new(page_response, selectors:, time_zone: config.time_zone).articles
      end
    end

    # Returns the responses to scrape: only the initial response, unless
    # `items.pagination.max_pages` is set, in which case RelNextPager supplies them.
    def page_responses_for(response:, selectors:, request_session:)
      max_pages = selectors.dig(:items, :pagination, :max_pages)
      return [response] unless max_pages

      RequestSession::RelNextPager.new(
        session: request_session,
        initial_response: response,
        max_pages:
      ).to_a
    end

    # Returns auto-source articles, or an empty array when auto_source is not configured.
    def auto_source_articles(response:, config:, request_session:)
      auto_source = config.auto_source
      return [] unless auto_source

      AutoSource.new(response, auto_source, request_session:).articles
    end
  end
end
@@ -0,0 +1,101 @@
# frozen_string_literal: true

module Html2rss
  # Shared helpers for hash normalization and structural operations.
  module HashUtil
    module_function

    # Deeply duplicates nested arrays and hashes.
    #
    # Hash keys are reused as-is; only values are duplicated. A leaf object
    # whose `dup` raises is returned unchanged instead of being replaced.
    #
    # @param object [Object] nested value from configuration or runtime state
    # @return [Object] deep duplicated object
    def deep_dup(object)
      case object
      in Hash
        object.transform_values { deep_dup(_1) }
      in Array
        object.map { deep_dup(_1) }
      else
        # A rescue *modifier* would return its right-hand side as the value,
        # so the fallback must be the object itself, not an exception class.
        begin
          object.dup
        rescue StandardError
          object
        end
      end
    end

    # Deeply merges nested hashes while replacing non-hash values from override.
    #
    # @param base [Hash] base hash
    # @param override [Hash] override hash
    # @return [Hash] merged hash (a new hash; neither input is mutated)
    def deep_merge(base, override)
      base.merge(override) do |_key, old_val, new_val|
        case [old_val, new_val]
        in [Hash, Hash]
          deep_merge(old_val, new_val)
        else
          new_val
        end
      end
    end

    # Converts string-keyed hashes to symbol-keyed hashes recursively.
    # Arrays are traversed; any other value is returned unchanged.
    #
    # @param object [Object] value to normalize
    # @param context [String] error context
    # @return [Object] normalized value
    # @raise [ArgumentError] if a hash key is neither a String nor a Symbol
    def deep_symbolize_keys(object, context: 'hash')
      case object
      in Hash
        object.each_with_object({}) do |(k, v), memo|
          memo[symbol_key(k, context:)] = deep_symbolize_keys(v, context:)
        end
      in Array
        object.map { deep_symbolize_keys(_1, context:) }
      else
        object
      end
    end

    # Validates that hash keys are symbols. Non-hash values are ignored.
    #
    # With `deep: true`, hashes that are direct values of a validated hash are
    # validated too; hashes nested inside arrays are not visited.
    #
    # @param value [Object] candidate hash container whose keys must be symbols
    # @param context [String] error context
    # @param deep [Boolean] whether nested hashes should also be validated
    # @return [void]
    # @raise [ArgumentError] if a non-symbol key is found
    def assert_symbol_keys!(value, context: 'hash', deep: true)
      return unless value in Hash

      unless value.each_key.all?(Symbol)
        # Use the same is_a? test as the check above so the reported key is
        # always one that actually failed it.
        invalid_key = value.each_key.find { !_1.is_a?(Symbol) }
        raise ArgumentError, "#{context} must use symbol keys (found #{invalid_key.inspect})"
      end

      value.each_value { assert_symbol_keys!(_1, context:, deep:) } if deep
    end

    # Validates that hash keys are strings. Non-hash values are ignored.
    #
    # With `deep: true`, hashes that are direct values of a validated hash are
    # validated too; hashes nested inside arrays are not visited.
    #
    # @param value [Object] candidate hash container whose keys must be strings
    # @param context [String] error context
    # @param deep [Boolean] whether nested hashes should also be validated
    # @return [void]
    # @raise [ArgumentError] if a non-string key is found
    def assert_string_keys!(value, context: 'hash', deep: true)
      return unless value in Hash

      unless value.each_key.all?(String)
        # Use the same is_a? test as the check above: a String-subclass key is
        # valid and must not be reported in place of the real offender.
        invalid_key = value.each_key.find { !_1.is_a?(String) }
        raise ArgumentError, "#{context} must use string keys (found #{invalid_key.inspect})"
      end

      value.each_value { assert_string_keys!(_1, context:, deep:) } if deep
    end

    # Normalizes a single hash key to a Symbol.
    #
    # @param key [Object] hash key to normalize
    # @param context [String] error context
    # @return [Symbol] the key itself, or the String key converted with to_sym
    # @raise [ArgumentError] if the key is neither a String nor a Symbol
    def symbol_key(key, context:)
      case key
      in Symbol then key
      in String then key.to_sym
      else
        raise ArgumentError, "#{context} must use string or symbol keys (found #{key.inspect})"
      end
    end
    private_class_method :symbol_key
  end
end
@@ -4,6 +4,7 @@ module Html2rss
4
4
  class HtmlExtractor
5
5
  # Extracts the earliest date from an article_tag.
6
6
  class DateExtractor
7
+ # @param article_tag [Nokogiri::XML::Element] article container node
7
8
  # @return [DateTime, nil]
8
9
  def self.call(article_tag)
9
10
  times = article_tag.css('[datetime]').filter_map do |tag|
@@ -5,6 +5,9 @@ module Html2rss
5
5
  ##
6
6
  # Extracts enclosures from HTML tags using various strategies.
7
7
  class EnclosureExtractor
8
+ # @param article_tag [Nokogiri::XML::Element] article container node
9
+ # @param base_url [String, Html2rss::Url] base URL for relative enclosure links
10
+ # @return [Array<Hash{Symbol => Object}>] normalized enclosure hashes
8
11
  def self.call(article_tag, base_url)
9
12
  [
10
13
  Extractors::Image,
@@ -16,10 +19,14 @@ module Html2rss
16
19
  end
17
20
  end
18
21
 
22
+ # Extraction strategies for enclosure-like media/link tags.
19
23
  module Extractors
20
24
  # Extracts image enclosures from HTML tags.
21
25
  # Finds all image sources and returns them in a format suitable for RSS.
22
26
  class Image
27
+ # @param article_tag [Nokogiri::XML::Element] article container node
28
+ # @param base_url [String, Html2rss::Url] base URL for relative image sources
29
+ # @return [Array<Hash{Symbol => Object}>] image enclosure hashes
23
30
  def self.call(article_tag, base_url:)
24
31
  article_tag.css('img[src]:not([src^="data"])').filter_map do |img|
25
32
  src = img['src'].to_s
@@ -36,6 +43,9 @@ module Html2rss
36
43
 
37
44
  # Extracts media enclosures (video/audio) from HTML tags.
38
45
  class Media
46
+ # @param article_tag [Nokogiri::XML::Element] article container node
47
+ # @param base_url [String, Html2rss::Url] base URL for relative media sources
48
+ # @return [Array<Hash{Symbol => Object}>] media enclosure hashes
39
49
  def self.call(article_tag, base_url:)
40
50
  article_tag.css('video source[src], audio source[src], audio[src]').filter_map do |element|
41
51
  src = element['src'].to_s
@@ -51,6 +61,9 @@ module Html2rss
51
61
 
52
62
  # Extracts PDF enclosures from HTML tags.
53
63
  class Pdf
64
+ # @param article_tag [Nokogiri::XML::Element] article container node
65
+ # @param base_url [String, Html2rss::Url] base URL for relative PDF links
66
+ # @return [Array<Hash{Symbol => Object}>] PDF enclosure hashes
54
67
  def self.call(article_tag, base_url:)
55
68
  article_tag.css('a[href$=".pdf"]').filter_map do |link|
56
69
  href = link['href'].to_s
@@ -67,6 +80,9 @@ module Html2rss
67
80
 
68
81
  # Extracts iframe enclosures from HTML tags.
69
82
  class Iframe
83
+ # @param article_tag [Nokogiri::XML::Element] article container node
84
+ # @param base_url [String, Html2rss::Url] base URL for relative iframe links
85
+ # @return [Array<Hash{Symbol => Object}>] iframe enclosure hashes
70
86
  def self.call(article_tag, base_url:)
71
87
  article_tag.css('iframe[src]').filter_map do |iframe|
72
88
  src = iframe['src']
@@ -83,6 +99,9 @@ module Html2rss
83
99
 
84
100
  # Extracts archive enclosures (zip, tar.gz, tgz) from HTML tags.
85
101
  class Archive
102
+ # @param article_tag [Nokogiri::XML::Element] article container node
103
+ # @param base_url [String, Html2rss::Url] base URL for relative archive links
104
+ # @return [Array<Hash{Symbol => Object}>] archive enclosure hashes
86
105
  def self.call(article_tag, base_url:)
87
106
  article_tag.css('a[href$=".zip"], a[href$=".tar.gz"], a[href$=".tgz"]').filter_map do |link|
88
107
  href = link['href'].to_s
@@ -5,6 +5,9 @@ module Html2rss
5
5
  ##
6
6
  # ImageExtractor is responsible for extracting image URLs from the article_tag.
7
7
  class ImageExtractor
8
+ # @param article_tag [Nokogiri::XML::Element] article container node
9
+ # @param base_url [String, Html2rss::Url] base URL for relative image URLs
10
+ # @return [Html2rss::Url, nil] best candidate image URL
8
11
  def self.call(article_tag, base_url:)
9
12
  img_src = from_source(article_tag) ||
10
13
  from_img(article_tag) ||
@@ -13,6 +16,8 @@ module Html2rss
13
16
  Url.from_relative(img_src, base_url) if img_src
14
17
  end
15
18
 
19
+ # @param article_tag [Nokogiri::XML::Element] article container node
20
+ # @return [String, nil] src attribute from first matching image tag
16
21
  def self.from_img(article_tag)
17
22
  article_tag.at_css('img[src]:not([src^="data"])')&.[]('src')
18
23
  end
@@ -21,6 +26,8 @@ module Html2rss
21
26
  # Extracts the largest image source from the srcset attribute
22
27
  # of an img tag or a source tag inside a picture tag.
23
28
  #
29
+ # @param article_tag [Nokogiri::XML::Element] article container node
30
+ # @return [String, nil] largest srcset URL candidate
24
31
  # @see <https://developer.mozilla.org/en-US/docs/Learn/HTML/Multimedia_and_embedding/Responsive_images>
25
32
  # @see <https://developer.mozilla.org/en-US/docs/Web/HTML/Element/img#srcset>
26
33
  # @see <https://developer.mozilla.org/en-US/docs/Web/HTML/Element/picture>
@@ -38,6 +45,8 @@ module Html2rss
38
45
  hash[hash.keys.max]
39
46
  end
40
47
 
48
+ # @param article_tag [Nokogiri::XML::Element] article container node
49
+ # @return [String, nil] best style-based background image URL
41
50
  def self.from_style(article_tag)
42
51
  article_tag.css('[style*="url"]')
43
52
  .filter_map { |tag| tag['style'][/url\(['"]?(.*?)['"]?\)/, 1] }
@@ -5,10 +5,14 @@ module Html2rss
5
5
  # HtmlExtractor is responsible for extracting details (headline, url, images, etc.)
6
6
  # from an article_tag.
7
7
  class HtmlExtractor
8
+ # Tags ignored when extracting visible text content from article containers.
8
9
  INVISIBLE_CONTENT_TAGS = %w[svg script noscript style template].to_set.freeze
10
+ # Heading tags used to prioritize title extraction.
9
11
  HEADING_TAGS = %w[h1 h2 h3 h4 h5 h6].freeze
12
+ # Selector used to derive non-headline description nodes.
10
13
  NON_HEADLINE_SELECTOR = (HEADING_TAGS.map { |tag| ":not(#{tag})" } + INVISIBLE_CONTENT_TAGS.to_a).freeze
11
14
 
15
+ # Anchor selector used to identify the canonical article link element.
12
16
  MAIN_ANCHOR_SELECTOR = begin
13
17
  buf = +'a[href]:not([href=""])'
14
18
  %w[# javascript: mailto: tel: file:// sms: data:].each do |prefix|
@@ -56,6 +60,7 @@ module Html2rss
56
60
  @selected_anchor = selected_anchor
57
61
  end
58
62
 
63
+ # @return [Hash{Symbol => Object}] extracted article attributes
59
64
  def call
60
65
  {
61
66
  title: extract_title,
@@ -23,6 +23,10 @@ module Html2rss
23
23
  ##
24
24
  # Think of it as `css_upwards` method.
25
25
  # It searches for the closest parent that matches the given selector.
26
+ #
27
+ # @param current_tag [Nokogiri::XML::Node, nil] starting node
28
+ # @param selector [String] CSS selector to search upwards for
29
+ # @return [Nokogiri::XML::Node, nil] first matching node in upward traversal
26
30
  def find_closest_selector_upwards(current_tag, selector)
27
31
  while current_tag
28
32
  found = current_tag.at_css(selector)
@@ -36,6 +40,10 @@ module Html2rss
36
40
 
37
41
  ##
38
42
  # Searches for the closest parent that matches the given tag name.
43
+ #
44
+ # @param current_tag [Nokogiri::XML::Node] starting node
45
+ # @param tag_name [String] tag name to find in ancestors
46
+ # @return [Nokogiri::XML::Node, nil] matching ancestor node
39
47
  def find_tag_in_ancestors(current_tag, tag_name)
40
48
  return current_tag if current_tag.name == tag_name
41
49
 
@@ -6,6 +6,7 @@ module Html2rss
6
6
  #
7
7
  # @see https://www.jsonfeed.org/version/1.1/
8
8
  class JsonFeedBuilder
9
+ # Official JSON Feed 1.1 schema version URL.
9
10
  VERSION_URL = 'https://jsonfeed.org/version/1.1'
10
11
 
11
12
  ##
@@ -6,15 +6,20 @@ module Html2rss
6
6
  module Rendering
7
7
  # Renders an HTML <audio> tag from a URL and type.
8
8
  class AudioRenderer
9
+ # @param url [String, Html2rss::Url] media URL for the audio source
10
+ # @param type [String] MIME type for the audio source
9
11
  def initialize(url:, type:)
10
12
  @url = url
11
13
  @type = type
12
14
  end
13
15
 
16
+ # @return [String] HTML audio snippet for article rendering
14
17
  def to_html
15
- %(<audio controls preload="none" referrerpolicy="no-referrer" crossorigin="anonymous">
16
- <source src="#{escaped_url}" type="#{escaped_type}">
17
- </audio>)
18
+ [
19
+ '<audio controls preload="none" referrerpolicy="no-referrer" crossorigin="anonymous">',
20
+ %(<source src="#{escaped_url}" type="#{escaped_type}">),
21
+ '</audio>'
22
+ ].join
18
23
  end
19
24
 
20
25
  private
@@ -16,7 +16,6 @@ module Html2rss
16
16
  # image: "https://example.com/image.jpg"
17
17
  # )
18
18
  # description = builder.call
19
- #
20
19
  class DescriptionBuilder
21
20
  # Removes the specified pattern from the beginning of the text
22
21
  # within a given range if the pattern occurs before the range's end.
@@ -6,23 +6,33 @@ module Html2rss
6
6
  module Rendering
7
7
  # Renders an HTML <img> tag from a URL and title.
8
8
  class ImageRenderer
9
+ # @param url [String, Html2rss::Url] image URL for the src attribute
10
+ # @param title [String, nil] title/alt text for the image
9
11
  def initialize(url:, title:)
10
12
  @url = url
11
13
  @title = title
12
14
  end
13
15
 
16
+ # @return [String] HTML image snippet for article rendering
14
17
  def to_html
15
- %(<img src="#{@url}"
16
- alt="#{escaped_title}"
17
- title="#{escaped_title}"
18
- loading="lazy"
19
- referrerpolicy="no-referrer"
20
- decoding="async"
21
- crossorigin="anonymous">).delete("\n").gsub(/\s+/, ' ')
18
+ attributes = [
19
+ %(src="#{escaped_url}"),
20
+ %(alt="#{escaped_title}"),
21
+ %(title="#{escaped_title}"),
22
+ 'loading="lazy"',
23
+ 'referrerpolicy="no-referrer"',
24
+ 'decoding="async"',
25
+ 'crossorigin="anonymous"'
26
+ ]
27
+ "<img #{attributes.join(' ')}>"
22
28
  end
23
29
 
24
30
  private
25
31
 
32
+ def escaped_url
33
+ CGI.escapeHTML(@url.to_s)
34
+ end
35
+
26
36
  def escaped_title
27
37
  CGI.escapeHTML(@title.to_s)
28
38
  end
@@ -16,6 +16,10 @@ module Html2rss
16
16
  end
17
17
 
18
18
  # @private
19
+ # @param type [String, nil] enclosure MIME type
20
+ # @param url [String, Html2rss::Url] enclosure URL
21
+ # @param title [String, nil] title used by image renderer
22
+ # @return [ImageRenderer, VideoRenderer, AudioRenderer, PdfRenderer, nil]
19
23
  def self.create_renderer_for_type(type, url:, title:)
20
24
  case type
21
25
  when %r{^image/}
@@ -6,16 +6,22 @@ module Html2rss
6
6
  module Rendering
7
7
  # Renders an HTML <iframe> for PDF documents.
8
8
  class PdfRenderer
9
+ # @param url [String, Html2rss::Url] PDF URL rendered in the iframe
9
10
  def initialize(url:)
10
11
  @url = url
11
12
  end
12
13
 
14
+ # @return [String] HTML iframe snippet for PDF rendering
13
15
  def to_html
14
- %(<iframe src="#{escaped_url}" width="100%" height="75vh"
15
- sandbox=""
16
- referrerpolicy="no-referrer"
17
- loading="lazy">
18
- </iframe>)
16
+ attributes = [
17
+ %(src="#{escaped_url}"),
18
+ 'width="100%"',
19
+ 'height="75vh"',
20
+ 'sandbox=""',
21
+ 'referrerpolicy="no-referrer"',
22
+ 'loading="lazy"'
23
+ ]
24
+ "<iframe #{attributes.join(' ')}></iframe>"
19
25
  end
20
26
 
21
27
  private
@@ -6,15 +6,20 @@ module Html2rss
6
6
  module Rendering
7
7
  # Renders an HTML <video> tag from a URL and type.
8
8
  class VideoRenderer
9
+ # @param url [String, Html2rss::Url] media URL for the video source
10
+ # @param type [String] MIME type for the video source
9
11
  def initialize(url:, type:)
10
12
  @url = url
11
13
  @type = type
12
14
  end
13
15
 
16
+ # @return [String] HTML video snippet for article rendering
14
17
  def to_html
15
- %(<video controls preload="none" referrerpolicy="no-referrer" crossorigin="anonymous" playsinline>
16
- <source src="#{escaped_url}" type="#{escaped_type}">
17
- </video>)
18
+ [
19
+ '<video controls preload="none" referrerpolicy="no-referrer" crossorigin="anonymous" playsinline>',
20
+ %(<source src="#{escaped_url}" type="#{escaped_type}">),
21
+ '</video>'
22
+ ].join
18
23
  end
19
24
 
20
25
  private
@@ -5,8 +5,17 @@ module Html2rss
5
5
  # images, audio, video, or embedded documents for feed descriptions.
6
6
  #
7
7
  # @example
8
- # Html2rss::Rendering::ImageRenderer.new(...).to_html
9
- # Html2rss::Rendering::MediaRenderer.for(...)
8
+ # Html2rss::Rendering::ImageRenderer.new(
9
+ # url: "https://example.com/image.jpg",
10
+ # title: "Example"
11
+ # ).to_html
12
+ #
13
+ # @example
14
+ # Html2rss::Rendering::MediaRenderer.for(
15
+ # enclosure: nil,
16
+ # image: "https://example.com/image.jpg",
17
+ # title: "Example"
18
+ # )
10
19
  #
11
20
  # @see Html2rss::Rendering::DescriptionBuilder
12
21
  module Rendering
@@ -4,15 +4,20 @@ module Html2rss
4
4
  ##
5
5
  # Tracks runtime request controls together with whether each value was explicitly set.
6
6
  class RequestControls
7
+ # Request-control keys accepted at the top level of feed config.
7
8
  TOP_LEVEL_KEYS = %i[strategy].freeze
9
+ # Request-control keys accepted under the nested `request` config.
8
10
  REQUEST_KEYS = %i[max_redirects max_requests].freeze
9
11
 
10
12
  ##
11
- # @param config [Hash<Symbol, Object>, Hash<String, Object>] raw config input
13
+ # @param config [Hash{Symbol => Object}] raw config input
12
14
  # @return [RequestControls] request controls extracted from the config hash
13
15
  def self.from_config(config)
16
+ HashUtil.assert_symbol_keys!(config, context: 'config', deep: false)
17
+ HashUtil.assert_symbol_keys!(config[:request], context: 'config[:request]') if config[:request].is_a?(Hash)
18
+
14
19
  new(
15
- strategy: value_for(config, :strategy),
20
+ strategy: config[:strategy],
16
21
  max_redirects: request_value_for(config, :max_redirects),
17
22
  max_requests: request_value_for(config, :max_requests),
18
23
  explicit_keys: explicit_keys_for(config)
@@ -20,33 +25,23 @@ module Html2rss
20
25
  end
21
26
 
22
27
  def self.explicit_keys_for(config)
23
- TOP_LEVEL_KEYS.filter { top_level_key?(config, _1) } +
28
+ TOP_LEVEL_KEYS.filter { config.key?(_1) } +
24
29
  REQUEST_KEYS.filter { request_key?(config, _1) }
25
30
  end
26
31
 
27
- def self.value_for(config, key)
28
- return config[key] if config.key?(key)
29
- return config[key.to_s] if config.key?(key.to_s)
30
-
31
- nil
32
- end
33
-
34
32
  def self.request_value_for(config, key)
35
- request_config = value_for(config, :request)
33
+ request_config = config[:request]
36
34
  return nil unless request_config.is_a?(Hash)
37
35
 
38
- value_for(request_config, key)
39
- end
40
-
41
- def self.top_level_key?(config, key)
42
- config.key?(key) || config.key?(key.to_s)
36
+ request_config[key]
43
37
  end
44
38
 
45
39
  def self.request_key?(config, key)
46
- request_config = value_for(config, :request)
47
- request_config.is_a?(Hash) && top_level_key?(request_config, key)
40
+ request_config = config[:request]
41
+ request_config.is_a?(Hash) && request_config.key?(key)
48
42
  end
49
- private_class_method :explicit_keys_for, :request_value_for, :top_level_key?, :request_key?, :value_for
43
+
44
+ private_class_method :explicit_keys_for, :request_value_for, :request_key?
50
45
 
51
46
  ##
52
47
  # @param strategy [Symbol, nil] effective request strategy
@@ -97,8 +92,8 @@ module Html2rss
97
92
  ##
98
93
  # Applies only explicitly set controls to the provided config hash.
99
94
  #
100
- # @param config [Hash<Symbol, Object>] mutable config hash
101
- # @return [Hash<Symbol, Object>] the same hash with explicit controls written
95
+ # @param config [Hash{Symbol => Object}] mutable config hash
96
+ # @return [Hash{Symbol => Object}] the same hash with explicit controls written
102
97
  def apply_to(config)
103
98
  config[:strategy] = strategy if explicit?(:strategy)
104
99
  apply_request_value(config, :max_redirects, max_redirects)