html2rss 0.17.0 → 0.18.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (122) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +48 -656
  3. data/exe/html2rss +1 -1
  4. data/html2rss.gemspec +5 -2
  5. data/lib/html2rss/articles/deduplicator.rb +49 -0
  6. data/lib/html2rss/auto_source/cleanup.rb +33 -5
  7. data/lib/html2rss/auto_source/scraper/html.rb +118 -43
  8. data/lib/html2rss/auto_source/scraper/json_state.rb +377 -0
  9. data/lib/html2rss/auto_source/scraper/microdata.rb +399 -0
  10. data/lib/html2rss/auto_source/scraper/schema/category_extractor.rb +102 -0
  11. data/lib/html2rss/auto_source/scraper/schema/item_list.rb +2 -2
  12. data/lib/html2rss/auto_source/scraper/schema/list_item.rb +3 -3
  13. data/lib/html2rss/auto_source/scraper/schema/thing.rb +48 -8
  14. data/lib/html2rss/auto_source/scraper/schema.rb +12 -8
  15. data/lib/html2rss/auto_source/scraper/semantic_html/anchor_selector.rb +199 -0
  16. data/lib/html2rss/auto_source/scraper/semantic_html.rb +84 -79
  17. data/lib/html2rss/auto_source/scraper/wordpress_api/page_scope.rb +261 -0
  18. data/lib/html2rss/auto_source/scraper/wordpress_api/posts_endpoint.rb +134 -0
  19. data/lib/html2rss/auto_source/scraper/wordpress_api.rb +179 -0
  20. data/lib/html2rss/auto_source/scraper.rb +142 -8
  21. data/lib/html2rss/auto_source.rb +119 -47
  22. data/lib/html2rss/blocked_surface.rb +64 -0
  23. data/lib/html2rss/category_extractor.rb +82 -0
  24. data/lib/html2rss/cli.rb +170 -23
  25. data/lib/html2rss/config/class_methods.rb +189 -0
  26. data/lib/html2rss/config/dynamic_params.rb +68 -0
  27. data/lib/html2rss/config/multiple_feeds_config.rb +50 -0
  28. data/lib/html2rss/config/request_headers.rb +130 -0
  29. data/lib/html2rss/config/schema.rb +208 -0
  30. data/lib/html2rss/config/validator.rb +108 -0
  31. data/lib/html2rss/config.rb +112 -61
  32. data/lib/html2rss/error.rb +6 -0
  33. data/lib/html2rss/html_extractor/date_extractor.rb +19 -0
  34. data/lib/html2rss/html_extractor/enclosure_extractor.rb +101 -0
  35. data/lib/html2rss/html_extractor/image_extractor.rb +49 -0
  36. data/lib/html2rss/html_extractor.rb +136 -0
  37. data/lib/html2rss/html_navigator.rb +46 -0
  38. data/lib/html2rss/json_feed_builder/item.rb +94 -0
  39. data/lib/html2rss/json_feed_builder.rb +58 -0
  40. data/lib/html2rss/rendering/audio_renderer.rb +31 -0
  41. data/lib/html2rss/rendering/description_builder.rb +88 -0
  42. data/lib/html2rss/rendering/image_renderer.rb +31 -0
  43. data/lib/html2rss/rendering/media_renderer.rb +33 -0
  44. data/lib/html2rss/rendering/pdf_renderer.rb +28 -0
  45. data/lib/html2rss/rendering/video_renderer.rb +31 -0
  46. data/lib/html2rss/rendering.rb +14 -0
  47. data/lib/html2rss/request_controls.rb +128 -0
  48. data/lib/html2rss/request_service/browserless_strategy.rb +103 -7
  49. data/lib/html2rss/request_service/budget.rb +39 -0
  50. data/lib/html2rss/request_service/context.rb +64 -20
  51. data/lib/html2rss/request_service/faraday_strategy.rb +135 -5
  52. data/lib/html2rss/request_service/policy.rb +248 -0
  53. data/lib/html2rss/request_service/puppet_commander.rb +212 -13
  54. data/lib/html2rss/request_service/response.rb +42 -2
  55. data/lib/html2rss/request_service/response_guard.rb +62 -0
  56. data/lib/html2rss/request_service.rb +31 -15
  57. data/lib/html2rss/request_session/rel_next_pager.rb +70 -0
  58. data/lib/html2rss/request_session/runtime_input.rb +57 -0
  59. data/lib/html2rss/request_session/runtime_policy.rb +76 -0
  60. data/lib/html2rss/request_session.rb +118 -0
  61. data/lib/html2rss/rss_builder/article.rb +166 -0
  62. data/lib/html2rss/rss_builder/channel.rb +96 -11
  63. data/lib/html2rss/rss_builder/enclosure.rb +48 -0
  64. data/lib/html2rss/rss_builder/stylesheet.rb +4 -4
  65. data/lib/html2rss/rss_builder.rb +72 -71
  66. data/lib/html2rss/selectors/config.rb +122 -0
  67. data/lib/html2rss/selectors/extractors/attribute.rb +50 -0
  68. data/lib/html2rss/selectors/extractors/href.rb +53 -0
  69. data/lib/html2rss/selectors/extractors/html.rb +48 -0
  70. data/lib/html2rss/selectors/extractors/static.rb +41 -0
  71. data/lib/html2rss/selectors/extractors/text.rb +46 -0
  72. data/lib/html2rss/selectors/extractors.rb +52 -0
  73. data/lib/html2rss/selectors/object_to_xml_converter.rb +61 -0
  74. data/lib/html2rss/selectors/post_processors/base.rb +74 -0
  75. data/lib/html2rss/selectors/post_processors/gsub.rb +85 -0
  76. data/lib/html2rss/selectors/post_processors/html_to_markdown.rb +45 -0
  77. data/lib/html2rss/selectors/post_processors/html_transformers/transform_urls_to_absolute_ones.rb +35 -0
  78. data/lib/html2rss/selectors/post_processors/html_transformers/wrap_img_in_a.rb +47 -0
  79. data/lib/html2rss/selectors/post_processors/markdown_to_html.rb +52 -0
  80. data/lib/html2rss/selectors/post_processors/parse_time.rb +73 -0
  81. data/lib/html2rss/selectors/post_processors/parse_uri.rb +40 -0
  82. data/lib/html2rss/selectors/post_processors/sanitize_html.rb +150 -0
  83. data/lib/html2rss/selectors/post_processors/substring.rb +74 -0
  84. data/lib/html2rss/selectors/post_processors/template.rb +73 -0
  85. data/lib/html2rss/selectors/post_processors.rb +43 -0
  86. data/lib/html2rss/selectors.rb +294 -0
  87. data/lib/html2rss/url.rb +262 -0
  88. data/lib/html2rss/version.rb +1 -1
  89. data/lib/html2rss.rb +129 -70
  90. data/lib/tasks/config_schema.rake +17 -0
  91. data/schema/html2rss-config.schema.json +469 -0
  92. metadata +115 -38
  93. data/lib/html2rss/attribute_post_processors/base.rb +0 -74
  94. data/lib/html2rss/attribute_post_processors/gsub.rb +0 -64
  95. data/lib/html2rss/attribute_post_processors/html_to_markdown.rb +0 -43
  96. data/lib/html2rss/attribute_post_processors/html_transformers/transform_urls_to_absolute_ones.rb +0 -27
  97. data/lib/html2rss/attribute_post_processors/html_transformers/wrap_img_in_a.rb +0 -41
  98. data/lib/html2rss/attribute_post_processors/markdown_to_html.rb +0 -50
  99. data/lib/html2rss/attribute_post_processors/parse_time.rb +0 -46
  100. data/lib/html2rss/attribute_post_processors/parse_uri.rb +0 -46
  101. data/lib/html2rss/attribute_post_processors/sanitize_html.rb +0 -115
  102. data/lib/html2rss/attribute_post_processors/substring.rb +0 -72
  103. data/lib/html2rss/attribute_post_processors/template.rb +0 -101
  104. data/lib/html2rss/attribute_post_processors.rb +0 -44
  105. data/lib/html2rss/auto_source/article.rb +0 -127
  106. data/lib/html2rss/auto_source/channel.rb +0 -78
  107. data/lib/html2rss/auto_source/reducer.rb +0 -48
  108. data/lib/html2rss/auto_source/rss_builder.rb +0 -70
  109. data/lib/html2rss/auto_source/scraper/semantic_html/extractor.rb +0 -136
  110. data/lib/html2rss/auto_source/scraper/semantic_html/image.rb +0 -54
  111. data/lib/html2rss/config/channel.rb +0 -125
  112. data/lib/html2rss/config/selectors.rb +0 -103
  113. data/lib/html2rss/item.rb +0 -186
  114. data/lib/html2rss/item_extractors/attribute.rb +0 -50
  115. data/lib/html2rss/item_extractors/href.rb +0 -52
  116. data/lib/html2rss/item_extractors/html.rb +0 -46
  117. data/lib/html2rss/item_extractors/static.rb +0 -39
  118. data/lib/html2rss/item_extractors/text.rb +0 -44
  119. data/lib/html2rss/item_extractors.rb +0 -88
  120. data/lib/html2rss/object_to_xml_converter.rb +0 -56
  121. data/lib/html2rss/rss_builder/item.rb +0 -83
  122. data/lib/html2rss/utils.rb +0 -113
@@ -0,0 +1,64 @@
1
+ # frozen_string_literal: true
2
+
3
module Html2rss
  ##
  # Shared anti-bot/interstitial signatures used by request and auto-source flows.
  #
  # Keeping the signature table and the matching logic in one module lets
  # request-time guards and auto-source surface classification agree on what
  # counts as a blocked page.
  module BlockedSurface
    # Each signature is a Hash with:
    #   :key         - Symbol identifying the signature
    #   :min_matches - how many of :patterns must match the body
    #   :patterns    - Regexps tested against the normalized body
    #   :message     - human-readable explanation of the block
    INTERSTITIAL_SIGNATURES = [
      {
        key: :cloudflare_interstitial,
        min_matches: 2,
        patterns: [
          %r{<title>\s*just a moment\.\.\.\s*</title>}i,
          /checking your browser before accessing/i,
          /please (?:enable|turn on) javascript and cookies/i,
          %r{cdn-cgi/challenge-platform}i,
          /cloudflare ray id/i
        ],
        message: 'Blocked surface detected: Cloudflare anti-bot interstitial page. ' \
                 'Retry with --strategy browserless, try a more specific public listing URL, ' \
                 'or run from an environment that can complete anti-bot checks.'
      }
    ].freeze

    class << self
      ##
      # Returns the first matching interstitial signature for the provided body.
      #
      # @param body [String, nil] response body candidate
      # @return [Hash, nil] signature hash when matched, otherwise nil
      def interstitial_signature_for(body)
        text = normalize_body(body)

        INTERSTITIAL_SIGNATURES.find do |candidate|
          interstitial_signature_match?(text, candidate)
        end
      end

      ##
      # @param body [String, nil] response body candidate
      # @return [Boolean] true when body matches a known interstitial signature
      def interstitial?(body)
        interstitial_signature_for(body) ? true : false
      end

      private

      # Counts pattern hits in declaration order and stops at the first pattern
      # after which the hit count has reached the signature's threshold
      # (:min_matches, defaulting to 1).
      #
      # @param body [String] normalized body
      # @param signature [Hash] one entry of INTERSTITIAL_SIGNATURES
      # @return [Boolean]
      def interstitial_signature_match?(body, signature)
        required = signature.fetch(:min_matches, 1)
        hits = 0

        signature.fetch(:patterns).any? do |pattern|
          hits += 1 if pattern.match?(body)
          hits >= required
        end
      end

      # Converts the body to UTF-8, dropping bytes that are invalid or have no
      # UTF-8 mapping. If the conversion itself raises an encoding error, the
      # bytes are relabelled as UTF-8 and scrubbed instead.
      #
      # @param body [String, nil]
      # @return [String]
      def normalize_body(body)
        raw = body.to_s
        raw.encode(Encoding::UTF_8, invalid: :replace, undef: :replace, replace: '')
      rescue Encoding::CompatibilityError, Encoding::UndefinedConversionError, Encoding::InvalidByteSequenceError
        body.to_s.force_encoding(Encoding::UTF_8).scrub
      end
    end
  end
end
@@ -0,0 +1,82 @@
1
+ # frozen_string_literal: true
2
+
3
module Html2rss
  ##
  # CategoryExtractor collects category strings from an article element.
  #
  # Two sources are inspected for every descendant element:
  # 1. the visible text of elements whose `class` contains a category-related term;
  # 2. the values of `data-*` attributes whose name contains a category-related term.
  class CategoryExtractor
    # Common category-related terms to look for in class and attribute names
    CATEGORY_TERMS = %w[category tag topic section label theme subject].freeze

    # CSS selectors to find elements with category-related class names
    CATEGORY_SELECTORS = CATEGORY_TERMS.map { |term| "[class*=\"#{term}\"]" }.freeze

    # Regex pattern for matching category-related class/attribute names
    CATEGORY_ATTR_PATTERN = /#{CATEGORY_TERMS.join('|')}/i

    # Only attributes carrying this prefix are considered data attributes.
    # Without the prefix check, accessibility attributes such as `aria-label`
    # ("label" is a category term) would be reported as categories.
    DATA_ATTR_PATTERN = /\Adata-/i

    ##
    # Extracts categories from the given article tag.
    #
    # @param article_tag [Nokogiri::XML::Element, nil] The article element to extract categories from
    # @return [Array<String>] Array of unique, stripped, non-empty category strings
    def self.call(article_tag)
      return [] unless article_tag

      extract_all_categories(article_tag)
        .map(&:strip)
        .reject(&:empty?)
    end

    ##
    # Walks all descendants of the article element once and gathers both
    # class-based text categories and data-attribute categories.
    #
    # @param article_tag [Nokogiri::XML::Element] The article element
    # @return [Set<String>] Set of category strings
    def self.extract_all_categories(article_tag)
      Set.new.tap do |categories|
        article_tag.css('*').each do |element|
          # Text categories only come from elements with a category-related class name.
          categories.merge(extract_text_categories(element)) if element['class']&.match?(CATEGORY_ATTR_PATTERN)

          # Data-attribute categories are looked up on every element.
          categories.merge(extract_element_data_categories(element))
        end
      end
    end

    ##
    # Extracts categories from the `data-*` attributes of a single element.
    #
    # An attribute contributes its stripped value when its name starts with
    # `data-` and contains one of CATEGORY_TERMS. Blank values are ignored.
    #
    # @param element [Nokogiri::XML::Element] The element to process
    # @return [Set<String>] Set of category strings
    def self.extract_element_data_categories(element)
      Set.new.tap do |categories|
        element.attributes.each_value do |attr|
          name = attr.name
          next unless name.match?(DATA_ATTR_PATTERN) && name.match?(CATEGORY_ATTR_PATTERN)

          value = attr.value&.strip
          categories.add(value) if value && !value.empty?
        end
      end
    end

    ##
    # Extracts text-based categories from an element.
    #
    # When the element contains anchors, each anchor's visible text is one
    # category. Otherwise the element's own visible text is split on newlines.
    #
    # @param element [Nokogiri::XML::Element] The element to process
    # @return [Set<String>] Set of category strings
    def self.extract_text_categories(element)
      anchor_values = element.css('a').filter_map do |node|
        HtmlExtractor.extract_visible_text(node)
      end
      return Set.new(anchor_values.reject(&:empty?)) if anchor_values.any?

      text = HtmlExtractor.extract_visible_text(element)
      return Set.new unless text

      Set.new(text.split(/\n+/).map(&:strip).reject(&:empty?))
    end
  end
end
data/lib/html2rss/cli.rb CHANGED
@@ -1,46 +1,193 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- require_relative '../html2rss'
3
+ require 'fileutils'
4
+ require 'json'
4
5
  require 'thor'
5
6
 
6
7
  ##
7
8
  # The Html2rss namespace / command line interface.
8
9
  module Html2rss
9
- Log = Logger.new($stderr)
10
-
11
10
  ##
12
11
  # The Html2rss command line interface.
13
- class CLI < Thor
12
+ class CLI < Thor # rubocop:disable Metrics/ClassLength
13
+ check_unknown_options!
14
+
14
15
  def self.exit_on_failure?
15
16
  true
16
17
  end
17
18
 
18
- desc 'feed YAML_FILE [FEED_NAME] [param=value ...]', 'Print RSS built from the YAML_FILE file to stdout'
19
- ##
20
- # Prints the feed to STDOUT.
21
- #
22
- # @param yaml_file [String] Path to the YAML configuration file.
23
- # @param options [Array<String>] Additional options including feed name and parameters.
24
- # @return [nil]
25
- def feed(yaml_file, *options)
26
- raise "File '#{yaml_file}' does not exist" unless File.exist?(yaml_file)
27
-
28
- feed_name = options.shift unless options.first&.include?('=')
29
- params = options.to_h { |opt| opt.split('=', 2) }
19
+ desc 'feed YAML_FILE [feed_name]', 'Print RSS built from the YAML_FILE file to stdout'
20
+ method_option :params,
21
+ type: :hash,
22
+ optional: true,
23
+ required: false,
24
+ default: {}
25
+ method_option :strategy,
26
+ type: :string,
27
+ desc: 'The strategy to request the URL',
28
+ enum: %w[faraday browserless]
29
+ method_option :max_redirects,
30
+ type: :numeric,
31
+ desc: 'Maximum redirects to follow per request'
32
+ method_option :max_requests,
33
+ type: :numeric,
34
+ desc: 'Maximum requests to allow for this feed build'
35
+ def feed(yaml_file, feed_name = nil)
36
+ config = Html2rss.config_from_yaml_file(yaml_file, feed_name)
37
+ config[:params] = options[:params] || {}
38
+ apply_runtime_request_overrides!(config)
30
39
 
31
- puts Html2rss.feed_from_yaml_config(yaml_file, feed_name, params:)
40
+ puts(execute_feed { Html2rss.feed(config) })
32
41
  end
33
42
 
34
- desc 'auto URL', 'Automatically sources an RSS feed from the URL'
43
+ desc 'auto [URL]', 'Automatically sources an RSS feed from the URL'
35
44
  method_option :strategy,
36
45
  type: :string,
37
46
  desc: 'The strategy to request the URL',
38
- enum: RequestService.strategy_names,
39
- default: RequestService.default_strategy_name
40
- def auto(url)
41
- strategy = options.fetch(:strategy) { RequestService.default_strategy_name }.to_sym
47
+ enum: %w[faraday browserless]
48
+ method_option :format,
49
+ type: :string,
50
+ desc: 'Output format for the auto-sourced feed',
51
+ enum: %w[rss jsonfeed],
52
+ default: 'rss'
53
+ method_option :items_selector, type: :string, desc: 'CSS selector for items (will be enhanced) (optional)'
54
+ method_option :max_redirects,
55
+ type: :numeric,
56
+ desc: 'Maximum redirects to follow per request'
57
+ method_option :max_requests,
58
+ type: :numeric,
59
+ desc: 'Maximum requests to allow for this feed build'
60
+ def auto(url) # rubocop:disable Metrics/MethodLength
61
+ format = options.fetch(:format, 'rss')
62
+ source_method = format == 'jsonfeed' ? Html2rss.method(:auto_json_feed) : Html2rss.method(:auto_source)
63
+
64
+ result = execute_feed do
65
+ source_method.call(
66
+ url,
67
+ strategy: current_strategy,
68
+ items_selector: options[:items_selector],
69
+ max_redirects: options[:max_redirects],
70
+ max_requests: options[:max_requests]
71
+ )
72
+ end
73
+
74
+ puts(format == 'jsonfeed' ? JSON.pretty_generate(result) : result)
75
+ end
76
+
77
+ desc 'schema', 'Print the exported config JSON Schema'
78
+ method_option :pretty,
79
+ type: :boolean,
80
+ desc: 'Pretty-print the schema JSON',
81
+ default: true
82
+ method_option :write,
83
+ type: :string,
84
+ desc: 'Write the schema JSON to the given file path'
85
+ ##
86
+ # Prints or writes the exported configuration JSON Schema.
87
+ #
88
+ # @return [void]
89
+ def schema
90
+ schema_json = Html2rss::Config.json_schema_json(pretty: options.fetch(:pretty, true))
91
+
92
+ if options[:write]
93
+ FileUtils.mkdir_p(File.dirname(options[:write]))
94
+ File.write(options[:write], "#{schema_json}\n")
95
+ puts options[:write]
96
+ return
97
+ end
98
+
99
+ puts schema_json
100
+ end
101
+
102
+ desc 'validate YAML_FILE [feed_name]', 'Validate a YAML config with the runtime validator'
103
+ method_option :params,
104
+ type: :hash,
105
+ optional: true,
106
+ required: false,
107
+ default: {}
108
+ ##
109
+ # Validates a YAML config and prints the result.
110
+ #
111
+ # @param yaml_file [String] the YAML file to validate
112
+ # @param feed_name [String, nil] optional feed name for multi-feed files
113
+ # @return [void]
114
+ def validate(yaml_file, feed_name = nil)
115
+ result = Html2rss::Config.validate_yaml(yaml_file, feed_name, params: options[:params] || {})
116
+
117
+ raise Thor::Error, "Invalid configuration: #{result.errors.to_h}" unless result.success?
118
+
119
+ puts 'Configuration is valid'
120
+ end
121
+
122
+ private
123
+
124
+ def apply_runtime_request_overrides!(config)
125
+ clear_blank_request_overrides!(config)
126
+ request_controls.apply_to(config)
127
+ end
128
+
129
+ def clear_blank_request_overrides!(config)
130
+ config.delete(:strategy) if config[:strategy].nil?
131
+
132
+ request_config = config[:request]
133
+ return unless request_config.is_a?(Hash)
134
+
135
+ %i[max_redirects max_requests].each do |key|
136
+ request_config.delete(key) if request_config[key].nil?
137
+ end
138
+ config.delete(:request) if request_config.empty?
139
+ end
140
+
141
+ def request_controls
142
+ Html2rss::RequestControls.new(
143
+ strategy: options[:strategy]&.to_sym,
144
+ max_redirects: options[:max_redirects],
145
+ max_requests: options[:max_requests],
146
+ explicit_keys: explicit_request_control_keys
147
+ )
148
+ end
149
+
150
+ def explicit_request_control_keys
151
+ keys = []
152
+ keys << :strategy if options[:strategy]
153
+ keys << :max_redirects unless options[:max_redirects].nil?
154
+ keys << :max_requests unless options[:max_requests].nil?
155
+ keys
156
+ end
157
+
158
+ def current_strategy
159
+ options[:strategy]&.to_sym || :faraday
160
+ end
161
+
162
+ def current_max_redirects
163
+ options.fetch(:max_redirects, Html2rss::RequestService::Policy::DEFAULTS[:max_redirects])
164
+ end
165
+
166
+ def current_max_requests
167
+ options.fetch(:max_requests, Html2rss::RequestService::Policy::DEFAULTS[:max_requests])
168
+ end
169
+
170
+ def suggested_max_redirects
171
+ current_max_redirects + 1
172
+ end
173
+
174
+ def suggested_max_requests
175
+ current_max_requests + 1
176
+ end
42
177
 
43
- puts Html2rss.auto_source(url, strategy:)
178
+ def execute_feed # rubocop:disable Metrics/MethodLength
179
+ yield
180
+ rescue Faraday::FollowRedirects::RedirectLimitReached => error
181
+ raise Thor::Error,
182
+ "#{error.message}. retry with --max-redirects #{suggested_max_redirects} or use the final URL directly."
183
+ rescue Html2rss::RequestService::RequestBudgetExceeded => error
184
+ raise Thor::Error,
185
+ "#{error.message}. retry with --max-requests #{suggested_max_requests} " \
186
+ 'or increase request.max_requests in the config.'
187
+ rescue Html2rss::RequestService::BrowserlessConfigurationError,
188
+ Html2rss::RequestService::BrowserlessConnectionFailed,
189
+ Html2rss::RequestService::BlockedSurfaceDetected => error
190
+ raise Thor::Error, error.message
44
191
  end
45
192
  end
46
193
  end
@@ -0,0 +1,189 @@
1
+ # frozen_string_literal: true
2
+
3
module Html2rss
  class Config
    ##
    # Public class-level helpers for loading, validating, and exporting config.
    module ClassMethods
      # Default for the `params:` keyword; handled the same way as an explicit nil.
      UNSET = Object.new.freeze

      ##
      # Returns the exported JSON Schema for html2rss configuration.
      #
      # @return [Hash<String, Object>] JSON Schema represented as a Ruby hash
      def json_schema
        Schema.json_schema
      end

      ##
      # Returns the exported JSON Schema as JSON.
      #
      # @param pretty [Boolean] whether to pretty-print the JSON output
      # @return [String] serialized JSON Schema
      def json_schema_json(pretty: true)
        pretty ? JSON.pretty_generate(json_schema) : JSON.generate(json_schema)
      end

      ##
      # Validates a configuration hash with the runtime validator.
      #
      # When a dynamic parameter is missing, the unresolved config is validated
      # instead and the failure message is attached under :dynamic_params_error.
      #
      # @param config [Hash<Symbol, Object>] the configuration hash
      # @param params [Hash<Symbol, Object>, Hash<String, Object>, nil] dynamic parameters for string formatting
      # @return [Dry::Validation::Result] validation result after defaults and deprecations are applied
      def validate(config, params: UNSET)
        prepared_config = prepare_for_validation(resolve_effective_config(config, params:))

        Validator.new.call(prepared_config)
      rescue DynamicParams::ParamsMissing => error
        prepared_config = prepare_for_validation(deep_dup(config))
        prepared_config[:dynamic_params_error] = error.message

        Validator.new.call(prepared_config)
      end

      ##
      # Returns the packaged JSON Schema file path.
      #
      # @return [String] absolute path to the packaged JSON Schema file
      def schema_path
        Schema.path
      end

      ##
      # Loads and validates a YAML configuration file.
      #
      # @param file [String] the YAML file to load
      # @param feed_name [String, nil] optional feed name for multi-feed files
      # @param multiple_feeds_key [Symbol] key under which multiple feeds are defined
      # @param params [Hash<Symbol, Object>, Hash<String, Object>, nil] dynamic parameters for string formatting
      # @return [Dry::Validation::Result] validation result after defaults and deprecations are applied
      def validate_yaml(file, feed_name = nil, multiple_feeds_key: MultipleFeedsConfig::CONFIG_KEY_FEEDS, params: UNSET)
        validate(load_yaml(file, feed_name, multiple_feeds_key:), params:)
      end

      ##
      # Loads the feed configuration from a YAML file.
      #
      # Supports multiple feeds defined under the specified key (default :feeds).
      #
      # @param file [String] the YAML file to load.
      # @param feed_name [String, Symbol, nil] the feed name when using multiple feeds.
      # @param multiple_feeds_key [Symbol] the key under which multiple feeds are defined.
      # @return [Hash<Symbol, Object>] the configuration hash.
      # @raise [ArgumentError] if the file doesn't exist, is not a YAML mapping,
      #   the feed name is reserved, or the feed is not found.
      # rubocop:disable Metrics/MethodLength, Metrics/AbcSize
      def load_yaml(file, feed_name = nil, multiple_feeds_key: MultipleFeedsConfig::CONFIG_KEY_FEEDS)
        raise ArgumentError, "File '#{file}' does not exist" unless File.exist?(file)
        # Compare as strings: feed names usually arrive as Strings (e.g. from the
        # CLI) while the feeds key is a Symbol, so `==` alone would never match.
        raise ArgumentError, "`#{multiple_feeds_key}` is a reserved feed name" if feed_name.to_s == multiple_feeds_key.to_s

        yaml = YAML.safe_load_file(file, symbolize_names: true)
        # An empty file or a top-level scalar/sequence cannot be a feed config.
        raise ArgumentError, "File '#{file}' does not contain a YAML mapping" unless yaml.is_a?(Hash)

        return yaml unless yaml.key?(multiple_feeds_key)

        unless feed_name
          available_feeds = yaml.fetch(multiple_feeds_key).keys.join(', ')
          raise ArgumentError,
                "Feed name is required under `#{multiple_feeds_key}`. Available feeds: #{available_feeds}"
        end

        config = yaml.dig(multiple_feeds_key, feed_name.to_sym)
        raise ArgumentError, "Feed '#{feed_name}' not found under `#{multiple_feeds_key}` key." unless config

        MultipleFeedsConfig.to_single_feed(config, yaml, multiple_feeds_key:)
      end
      # rubocop:enable Metrics/MethodLength, Metrics/AbcSize

      ##
      # Processes the provided configuration hash, applying dynamic parameters if given,
      # and returns a new configuration object.
      #
      # @param config [Hash<Symbol, Object>] the configuration hash.
      # @param params [Hash<Symbol, Object>, Hash<String, Object>, nil] dynamic parameters for string formatting.
      # @return [Html2rss::Config] the configuration object.
      def from_hash(config, params: UNSET)
        new(resolve_effective_config(config, params:))
      end

      ##
      # Builds a top-level auto-source feed config for the public shortcut APIs.
      #
      # @param url [String] source page URL
      # @param items_selector [String, nil] optional selector hint for item extraction
      # @param request_controls [Html2rss::RequestControls, nil] explicit request controls to write
      # @return [Hash<Symbol, Object>] feed config hash ready for {from_hash}
      def auto_source_config(url:, items_selector: nil, request_controls: nil)
        config = {
          channel: default_config[:channel].merge(url:),
          auto_source: AutoSource::DEFAULT_CONFIG
        }

        request_controls ||= Html2rss::RequestControls.new
        request_controls.apply_to(config)

        config[:selectors] = { items: { selector: items_selector, enhance: true } } if items_selector
        config
      end

      ##
      # Provides a default configuration.
      #
      # @return [Hash<Symbol, Object>] a hash with default configuration values.
      def default_config
        {
          strategy: RequestService.default_strategy_name,
          request: {
            max_redirects: RequestService::Policy::DEFAULTS[:max_redirects],
            max_requests: RequestService::Policy::DEFAULTS[:max_requests]
          },
          channel: { time_zone: 'UTC' },
          headers: RequestHeaders.browser_defaults,
          stylesheets: []
        }
      end

      private

      # Copies the config, then substitutes dynamic parameters into the
      # :headers and :channel sections of the copy. Explicit params take
      # precedence over the defaults declared under :parameters.
      def resolve_effective_config(config, params:)
        effective_config = deep_dup(config)
        resolved_params = parameter_defaults(effective_config)
        resolved_params.merge!(symbolize_param_keys(params)) unless params.equal?(UNSET) || params.nil?

        DynamicParams.call(effective_config[:headers], resolved_params)
        DynamicParams.call(effective_config[:channel], resolved_params)

        effective_config
      end

      # Defaults are keyed by Symbol (the YAML is loaded with symbolized names).
      # Explicit params may be keyed by String; without normalizing them a
      # String-keyed override would sit next to the Symbol-keyed default and
      # lose, because placeholder lookup tries the Symbol key first.
      def symbolize_param_keys(params)
        Hash(params).transform_keys { |key| key.respond_to?(:to_sym) ? key.to_sym : key }
      end

      # Collects `name => default` for every parameter definition that declares a :default.
      def parameter_defaults(config)
        config.fetch(:parameters, {})
              .filter_map do |name, definition|
                [name, definition[:default]] if definition.is_a?(Hash) && definition.key?(:default)
              end
              .to_h
      end

      def prepare_for_validation(config)
        Config::Preparer.new.call(deep_dup(config))
      end

      # Recursively copies hashes and arrays; other values are duplicated with
      # #dup, falling back to the object itself when it cannot be duplicated.
      # rubocop:disable Metrics/MethodLength
      def deep_dup(object)
        case object
        when Hash
          object.transform_values do |value|
            deep_dup(value)
          end
        when Array
          object.map { |value| deep_dup(value) }
        else
          begin
            object.dup
          rescue TypeError
            object
          end
        end
      end
      # rubocop:enable Metrics/MethodLength
    end
  end
end
@@ -0,0 +1,68 @@
1
+ # frozen_string_literal: true
2
+
3
module Html2rss
  class Config
    # Applies dynamic parameter formatting (`%{name}` / `%<name>`) to
    # configuration values. Hashes and enumerables are rewritten in place.
    class DynamicParams
      # Raised when a placeholder has no value and no replacement was supplied.
      class ParamsMissing < Html2rss::Error; end

      class << self
        # Walks the given value and formats every string that contains a
        # placeholder with values taken from +params+ (or from +getter+).
        #
        # @param value [String, Hash, Enumerable, Object] The value to process.
        # @param params [Hash] The parameters for substitution.
        # @param getter [Proc, nil] Optional proc to retrieve a key's value.
        # @param replace_missing_with [Object, nil] Value to substitute if a key is missing.
        # @return [Object] The processed value; non-container, non-string values are returned as is.
        def call(value, params = {}, getter: nil, replace_missing_with: nil)
          # Hash must be tested before Enumerable, since every Hash is Enumerable.
          case value
          when String     then from_string(value, params, getter:, replace_missing_with:)
          when Hash       then from_hash(value, params, getter:, replace_missing_with:)
          when Enumerable then from_enumerable(value, params, getter:, replace_missing_with:)
          else value
          end
        end

        private

        # Builds the lookup hash handed to Kernel#format. Values are resolved
        # on first access through the default block and then stored.
        def format_params(params, getter:, replace_missing_with:)
          Hash.new do |cache, name|
            resolved = getter ? getter.call(name) : params.fetch(name.to_sym) { params[name.to_s] }
            resolved = replace_missing_with if resolved.nil? && !replace_missing_with.nil?
            cache[name] = resolved
          end
        end

        # Formats a single string; strings without placeholders are returned untouched.
        def from_string(string, params, getter:, replace_missing_with:)
          return string unless string.match?(/%\{[^{}]*\}|%<[^<>]*>/)

          begin
            format(string, format_params(params, getter:, replace_missing_with:))
          rescue KeyError => error
            raise ParamsMissing, "Missing parameter for formatting: #{error.message}" if replace_missing_with.nil?

            string
          end
        end

        # Symbolizes the keys and processes each value, mutating the hash itself.
        def from_hash(hash, params, getter:, replace_missing_with:)
          hash.transform_keys!(&:to_sym)
          hash.transform_values! do |entry|
            call(entry, params, getter:, replace_missing_with:)
          end
        end

        # Processes each element, mutating the collection itself.
        def from_enumerable(enumerable, params, getter:, replace_missing_with:)
          enumerable.map! do |entry|
            call(entry, params, getter:, replace_missing_with:)
          end
        end
      end
    end
  end
end
@@ -0,0 +1,50 @@
1
+ # frozen_string_literal: true
2
+
3
module Html2rss
  class Config
    # Handles multiple feeds within a single configuration hash.
    # Individual feed configurations should be placed under the :feeds key,
    # where each feed name is the key for its feed configuration.
    # All global configuration keys (outside :feeds) are merged into each feed's settings;
    # on conflict the feed's own value takes precedence.
    class MultipleFeedsConfig
      CONFIG_KEY_FEEDS = :feeds

      class << self
        # Merges global configuration into the given feed's configuration.
        # The feed configuration hash is updated in place and returned.
        #
        # @param config [Hash] The feed-specific configuration.
        # @param yaml [Hash] The full YAML configuration.
        # @param multiple_feeds_key [Symbol] The key under which multiple feeds are defined.
        # @return [Hash] The merged configuration.
        def to_single_feed(config, yaml, multiple_feeds_key: CONFIG_KEY_FEEDS)
          global_keys = yaml.keys - [multiple_feeds_key]
          global_keys.each do |key|
            config[key] = merge_key(config, yaml, key)
          end
          config
        end

        private

        # Merges a specific global key from the YAML configuration into the feed configuration.
        #
        # - feed value absent (nil): the global value is used;
        # - both hashes: shallow merge, feed entries win;
        # - both arrays: global elements followed by feed elements;
        # - anything else: the feed value is kept, so a feed-level scalar
        #   (e.g. a different strategy) overrides the global one.
        #
        # @param config [Hash] The feed-specific configuration.
        # @param yaml [Hash] The full YAML configuration.
        # @param key [Symbol] The global configuration key to merge.
        # @return [Object] The merged value for the key.
        def merge_key(config, yaml, key)
          global_value = yaml.fetch(key, nil)
          local_value = config[key]

          case local_value
          when nil
            global_value
          when Hash
            global_value.is_a?(Hash) ? global_value.merge(local_value) : local_value
          when Array
            global_value.is_a?(Array) ? global_value + local_value : local_value
          else
            local_value
          end
        end
      end
    end
  end
end