html2rss 0.16.0 → 0.18.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (122) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +48 -657
  3. data/exe/html2rss +1 -1
  4. data/html2rss.gemspec +7 -4
  5. data/lib/html2rss/articles/deduplicator.rb +49 -0
  6. data/lib/html2rss/auto_source/cleanup.rb +33 -5
  7. data/lib/html2rss/auto_source/scraper/html.rb +118 -43
  8. data/lib/html2rss/auto_source/scraper/json_state.rb +377 -0
  9. data/lib/html2rss/auto_source/scraper/microdata.rb +399 -0
  10. data/lib/html2rss/auto_source/scraper/schema/category_extractor.rb +102 -0
  11. data/lib/html2rss/auto_source/scraper/schema/item_list.rb +2 -2
  12. data/lib/html2rss/auto_source/scraper/schema/list_item.rb +3 -3
  13. data/lib/html2rss/auto_source/scraper/schema/thing.rb +48 -8
  14. data/lib/html2rss/auto_source/scraper/schema.rb +12 -9
  15. data/lib/html2rss/auto_source/scraper/semantic_html/anchor_selector.rb +199 -0
  16. data/lib/html2rss/auto_source/scraper/semantic_html.rb +84 -78
  17. data/lib/html2rss/auto_source/scraper/wordpress_api/page_scope.rb +261 -0
  18. data/lib/html2rss/auto_source/scraper/wordpress_api/posts_endpoint.rb +134 -0
  19. data/lib/html2rss/auto_source/scraper/wordpress_api.rb +179 -0
  20. data/lib/html2rss/auto_source/scraper.rb +142 -8
  21. data/lib/html2rss/auto_source.rb +119 -47
  22. data/lib/html2rss/blocked_surface.rb +64 -0
  23. data/lib/html2rss/category_extractor.rb +82 -0
  24. data/lib/html2rss/cli.rb +170 -23
  25. data/lib/html2rss/config/class_methods.rb +189 -0
  26. data/lib/html2rss/config/dynamic_params.rb +68 -0
  27. data/lib/html2rss/config/multiple_feeds_config.rb +50 -0
  28. data/lib/html2rss/config/request_headers.rb +130 -0
  29. data/lib/html2rss/config/schema.rb +208 -0
  30. data/lib/html2rss/config/validator.rb +108 -0
  31. data/lib/html2rss/config.rb +112 -61
  32. data/lib/html2rss/error.rb +6 -0
  33. data/lib/html2rss/html_extractor/date_extractor.rb +19 -0
  34. data/lib/html2rss/html_extractor/enclosure_extractor.rb +101 -0
  35. data/lib/html2rss/html_extractor/image_extractor.rb +49 -0
  36. data/lib/html2rss/html_extractor.rb +136 -0
  37. data/lib/html2rss/html_navigator.rb +46 -0
  38. data/lib/html2rss/json_feed_builder/item.rb +94 -0
  39. data/lib/html2rss/json_feed_builder.rb +58 -0
  40. data/lib/html2rss/rendering/audio_renderer.rb +31 -0
  41. data/lib/html2rss/rendering/description_builder.rb +88 -0
  42. data/lib/html2rss/rendering/image_renderer.rb +31 -0
  43. data/lib/html2rss/rendering/media_renderer.rb +33 -0
  44. data/lib/html2rss/rendering/pdf_renderer.rb +28 -0
  45. data/lib/html2rss/rendering/video_renderer.rb +31 -0
  46. data/lib/html2rss/rendering.rb +14 -0
  47. data/lib/html2rss/request_controls.rb +128 -0
  48. data/lib/html2rss/request_service/browserless_strategy.rb +103 -7
  49. data/lib/html2rss/request_service/budget.rb +39 -0
  50. data/lib/html2rss/request_service/context.rb +64 -20
  51. data/lib/html2rss/request_service/faraday_strategy.rb +135 -5
  52. data/lib/html2rss/request_service/policy.rb +248 -0
  53. data/lib/html2rss/request_service/puppet_commander.rb +212 -13
  54. data/lib/html2rss/request_service/response.rb +42 -2
  55. data/lib/html2rss/request_service/response_guard.rb +62 -0
  56. data/lib/html2rss/request_service.rb +31 -15
  57. data/lib/html2rss/request_session/rel_next_pager.rb +70 -0
  58. data/lib/html2rss/request_session/runtime_input.rb +57 -0
  59. data/lib/html2rss/request_session/runtime_policy.rb +76 -0
  60. data/lib/html2rss/request_session.rb +118 -0
  61. data/lib/html2rss/rss_builder/article.rb +166 -0
  62. data/lib/html2rss/rss_builder/channel.rb +96 -11
  63. data/lib/html2rss/rss_builder/enclosure.rb +48 -0
  64. data/lib/html2rss/rss_builder/stylesheet.rb +4 -4
  65. data/lib/html2rss/rss_builder.rb +72 -71
  66. data/lib/html2rss/selectors/config.rb +122 -0
  67. data/lib/html2rss/selectors/extractors/attribute.rb +50 -0
  68. data/lib/html2rss/selectors/extractors/href.rb +53 -0
  69. data/lib/html2rss/selectors/extractors/html.rb +48 -0
  70. data/lib/html2rss/selectors/extractors/static.rb +41 -0
  71. data/lib/html2rss/selectors/extractors/text.rb +46 -0
  72. data/lib/html2rss/selectors/extractors.rb +52 -0
  73. data/lib/html2rss/selectors/object_to_xml_converter.rb +61 -0
  74. data/lib/html2rss/selectors/post_processors/base.rb +74 -0
  75. data/lib/html2rss/selectors/post_processors/gsub.rb +85 -0
  76. data/lib/html2rss/selectors/post_processors/html_to_markdown.rb +45 -0
  77. data/lib/html2rss/selectors/post_processors/html_transformers/transform_urls_to_absolute_ones.rb +35 -0
  78. data/lib/html2rss/selectors/post_processors/html_transformers/wrap_img_in_a.rb +47 -0
  79. data/lib/html2rss/selectors/post_processors/markdown_to_html.rb +52 -0
  80. data/lib/html2rss/selectors/post_processors/parse_time.rb +73 -0
  81. data/lib/html2rss/selectors/post_processors/parse_uri.rb +40 -0
  82. data/lib/html2rss/selectors/post_processors/sanitize_html.rb +150 -0
  83. data/lib/html2rss/selectors/post_processors/substring.rb +74 -0
  84. data/lib/html2rss/selectors/post_processors/template.rb +73 -0
  85. data/lib/html2rss/selectors/post_processors.rb +43 -0
  86. data/lib/html2rss/selectors.rb +294 -0
  87. data/lib/html2rss/url.rb +262 -0
  88. data/lib/html2rss/version.rb +1 -1
  89. data/lib/html2rss.rb +129 -70
  90. data/lib/tasks/config_schema.rake +17 -0
  91. data/schema/html2rss-config.schema.json +469 -0
  92. metadata +120 -46
  93. data/lib/html2rss/attribute_post_processors/base.rb +0 -74
  94. data/lib/html2rss/attribute_post_processors/gsub.rb +0 -64
  95. data/lib/html2rss/attribute_post_processors/html_to_markdown.rb +0 -43
  96. data/lib/html2rss/attribute_post_processors/html_transformers/transform_urls_to_absolute_ones.rb +0 -27
  97. data/lib/html2rss/attribute_post_processors/html_transformers/wrap_img_in_a.rb +0 -41
  98. data/lib/html2rss/attribute_post_processors/markdown_to_html.rb +0 -50
  99. data/lib/html2rss/attribute_post_processors/parse_time.rb +0 -46
  100. data/lib/html2rss/attribute_post_processors/parse_uri.rb +0 -46
  101. data/lib/html2rss/attribute_post_processors/sanitize_html.rb +0 -108
  102. data/lib/html2rss/attribute_post_processors/substring.rb +0 -72
  103. data/lib/html2rss/attribute_post_processors/template.rb +0 -101
  104. data/lib/html2rss/attribute_post_processors.rb +0 -44
  105. data/lib/html2rss/auto_source/article.rb +0 -127
  106. data/lib/html2rss/auto_source/channel.rb +0 -78
  107. data/lib/html2rss/auto_source/reducer.rb +0 -48
  108. data/lib/html2rss/auto_source/rss_builder.rb +0 -70
  109. data/lib/html2rss/auto_source/scraper/semantic_html/extractor.rb +0 -136
  110. data/lib/html2rss/auto_source/scraper/semantic_html/image.rb +0 -54
  111. data/lib/html2rss/config/channel.rb +0 -125
  112. data/lib/html2rss/config/selectors.rb +0 -103
  113. data/lib/html2rss/item.rb +0 -186
  114. data/lib/html2rss/item_extractors/attribute.rb +0 -50
  115. data/lib/html2rss/item_extractors/href.rb +0 -52
  116. data/lib/html2rss/item_extractors/html.rb +0 -46
  117. data/lib/html2rss/item_extractors/static.rb +0 -39
  118. data/lib/html2rss/item_extractors/text.rb +0 -44
  119. data/lib/html2rss/item_extractors.rb +0 -88
  120. data/lib/html2rss/object_to_xml_converter.rb +0 -56
  121. data/lib/html2rss/rss_builder/item.rb +0 -83
  122. data/lib/html2rss/utils.rb +0 -113
@@ -0,0 +1,130 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Html2rss
4
+ class Config
5
##
# Normalizes HTTP headers for outgoing requests.
# Ensures a browser-like baseline while respecting caller overrides.
#
# Precedence when building the final header set:
# 1. DEFAULT_HEADERS form the baseline.
# 2. Caller headers (keys canonicalized to `Header-Case`) replace baseline entries.
# 3. `Accept` is merged: caller values are placed in front of the default values.
# 4. `Accept-Language` and `Host` are only derived when the caller did not set them.
class RequestHeaders
  # Frozen: `join` returns a new, unfrozen String, and this value is handed
  # out inside the hashes returned by `to_h` / `browser_defaults`.
  DEFAULT_ACCEPT = %w[
    text/html
    application/xhtml+xml
    application/xml;q=0.9
    image/avif
    image/webp
    image/apng
    */*;q=0.8
  ].join(',').freeze

  # Frozen for the same reason as DEFAULT_ACCEPT.
  DEFAULT_USER_AGENT = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
    'AppleWebKit/537.36 (KHTML, like Gecko)',
    'Chrome/123.0.0.0',
    'Safari/537.36'
  ].join(' ').freeze

  DEFAULT_HEADERS = {
    'Accept' => DEFAULT_ACCEPT,
    'Cache-Control' => 'max-age=0',
    'Connection' => 'keep-alive',
    'Sec-Fetch-Dest' => 'document',
    'Sec-Fetch-Mode' => 'navigate',
    'Sec-Fetch-Site' => 'none',
    'Sec-Fetch-User' => '?1',
    'Upgrade-Insecure-Requests' => '1',
    'User-Agent' => DEFAULT_USER_AGENT
  }.freeze

  class << self
    ##
    # @return [Hash<String, String>] a mutable copy of the default header set
    def browser_defaults
      DEFAULT_HEADERS.dup
    end

    ##
    # Normalizes the provided headers while applying Html2rss defaults.
    #
    # @param headers [Hash, nil] caller provided headers; nil is treated as empty
    # @param channel_language [String, nil] language defined on the channel
    # @param url [String] request URL used to infer the Host header
    # @return [Hash<String, String>] normalized HTTP headers
    def normalize(headers, channel_language:, url:)
      new(headers || {}, channel_language:, url:).to_h
    end
  end

  ##
  # @param headers [Hash] caller provided headers (String or Symbol keys)
  # @param channel_language [String, nil] language defined on the channel
  # @param url [String, nil] request URL used to infer the Host header
  def initialize(headers, channel_language:, url:)
    @headers = headers
    @channel_language = channel_language
    @url = url
  end

  ##
  # Builds the final header hash. Entries whose value is nil are dropped.
  #
  # @return [Hash<String, String>] normalized HTTP headers
  def to_h
    merged = DEFAULT_HEADERS.dup
    custom = normalize_custom_headers(headers)

    # Accept is merged with the defaults instead of replacing them,
    # so it is taken out before the plain merge.
    accept_override = custom.delete('Accept')
    merged.merge!(custom)

    merged['Accept'] = normalize_accept(accept_override)
    # Derived values must not clobber what the caller set explicitly.
    merged['Accept-Language'] ||= build_accept_language
    merged['Host'] ||= request_host

    merged.compact
  end

  private

  attr_reader :headers, :channel_language, :url

  # Rewrites every key to canonical `Header-Case`.
  def normalize_custom_headers(custom)
    custom.transform_keys { canonicalize(_1) }
  end

  # "user-agent" / :"user-agent" => "User-Agent"
  def canonicalize(key)
    key.to_s.split('-').map!(&:capitalize).join('-')
  end

  # Puts the caller's Accept values in front of the defaults, keeping the
  # caller's order and skipping values that are already part of the defaults.
  def normalize_accept(override)
    return DEFAULT_ACCEPT if override.nil? || override.empty?

    values = accept_values(DEFAULT_ACCEPT)

    # reverse_each + unshift keeps the caller's original ordering at the front.
    accept_values(override).reverse_each do |value|
      next if values.include?(value)

      values.unshift(value)
    end

    values.join(',')
  end

  # Splits a comma separated header value into stripped, non-empty entries.
  def accept_values(header)
    header.split(',').map!(&:strip).reject(&:empty?)
  end

  # Derives an Accept-Language value from the channel language:
  # blank => "en-US,en;q=0.9", "de" => "de", "de_de" => "de-DE,de;q=0.9".
  def build_accept_language
    language = channel_language.to_s.strip
    return 'en-US,en;q=0.9' if language.empty?

    primary, region = language.tr('_', '-').split('-', 2)
    primary = primary.downcase
    region = region&.upcase

    return primary if region.nil?

    "#{primary}-#{region},#{primary};q=0.9"
  end

  # Host of the request URL, or nil when no URL is available.
  def request_host
    return nil if url.nil? || url.empty?

    Html2rss::Url.from_absolute(url).host
  end
end
129
+ end
130
+ end
@@ -0,0 +1,208 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Html2rss
4
+ class Config
5
##
# Builds the exported configuration JSON Schema from the runtime validators.
module Schema
  module_function

  SCHEMA_FILENAME = 'html2rss-config.schema.json'

  ##
  # Loads the dry-schema JSON Schema extension and assembles the schema.
  #
  # @return [Hash<String, Object>] JSON Schema represented as a Ruby hash
  def json_schema
    load_json_schema_extension!
    Builder.call
  end

  ##
  # Resolves the packaged schema path used by downstream tools.
  #
  # Walks from this file's directory up to the filesystem root and returns the
  # first existing `schema/<SCHEMA_FILENAME>`. When none exists, a path three
  # levels above this directory is returned without checking that it exists.
  #
  # @return [String] absolute path to the packaged JSON schema file
  def path
    directory = File.expand_path(__dir__)

    loop do
      packaged = File.join(directory, 'schema', SCHEMA_FILENAME)
      return packaged if File.exist?(packaged)

      parent = File.dirname(directory)
      # dirname of the root is the root itself: nothing left to visit.
      break if parent == directory

      directory = parent
    end

    File.expand_path("../../../schema/#{SCHEMA_FILENAME}", __dir__)
  end

  # Requires and activates the dry-schema `:json_schema` extension.
  def load_json_schema_extension!
    require 'dry/schema/extensions/json_schema'
    Dry::Schema.load_extensions(:json_schema)
  end

  ##
  # Orchestrates schema assembly from runtime validator contracts plus
  # client-facing overlays.
  class Builder
    # Convenience entry point: builds a new instance and runs it.
    def self.call = new.call

    ##
    # @return [Hash<String, Object>] the assembled schema with string keys
    def call
      validator_schema.then do |schema|
        apply_top_level(schema)
        assign_properties(schema.fetch(:properties))
        DeepStringifier.call(schema)
      end
    end

    private

    # JSON Schema generated from the top-level config validator.
    def validator_schema
      Html2rss::Config::Validator.new.schema.json_schema(loose: true)
    end

    # Sets the schema dialect and requires `selectors` or `auto_source`.
    def apply_top_level(schema)
      schema.merge!(
        {
          '$schema' => 'https://json-schema.org/draft/2020-12/schema',
          anyOf: [
            { 'required' => ['selectors'] },
            { 'required' => ['auto_source'] }
          ]
        }
      )
    end

    # Overlays the hand-written property fragments and drops the
    # `dynamic_params_error` property from the exported schema.
    def assign_properties(properties)
      properties[:headers] = Components.headers
      properties[:stylesheets] = Components.stylesheets
      properties[:auto_source] = Components.auto_source
      properties[:selectors] = Components.selectors
      properties.delete(:dynamic_params_error)
    end
  end

  ##
  # Exposes schema fragments that populate the top-level configuration schema.
  module Components
    module_function

    # @return [Hash] schema fragment for the `headers` property
    def headers
      string_values = { type: 'string' }

      {
        type: 'object',
        description: 'HTTP headers applied to every request.',
        additionalProperties: string_values
      }
    end

    # @return [Hash] schema fragment for the `stylesheets` property
    def stylesheets
      stylesheet = Html2rss::Config::Validator::StylesheetConfig.json_schema(loose: true)

      {
        type: 'array',
        description: 'Collection of stylesheets to attach to the RSS feed.',
        items: stylesheet
      }
    end

    # @return [Hash] auto_source schema with the default config attached as `default`
    def auto_source
      Html2rss::AutoSource::Config.json_schema(loose: true).tap do |fragment|
        fragment[:default] = DeepStringifier.call(Html2rss::AutoSource::DEFAULT_CONFIG)
      end
    end

    # @return [Hash] schema fragment for the `selectors` property
    def selectors
      Selectors.schema
    end
  end

  ##
  # Provides schema fragments that document selector configuration.
  module Selectors
    module_function

    RESERVED_SELECTOR_PATTERN = '^(?!items$|enclosure$|guid$|categories$).+$'

    # @return [Hash] schema fragment describing the whole `selectors` object
    def schema
      {
        type: 'object',
        description: 'Selectors used to extract article attributes.',
        properties: selector_properties,
        patternProperties: pattern_properties,
        additionalProperties: true
      }
    end

    # Fragments for the reserved keys: items, enclosure, guid and categories.
    def selector_properties
      guid_description =
        'List of selector keys used to build the GUID. ' \
        'Each entry must reference a sibling selector key; ' \
        'runtime validation enforces those references.'
      categories_description =
        'List of selector keys whose values will be used as categories. ' \
        'Each entry must reference a sibling selector key; ' \
        'runtime validation enforces those references.'

      {
        items: items_schema,
        enclosure: enclosure_schema,
        guid: reference_array(guid_description),
        categories: reference_array(categories_description)
      }
    end

    # Every key that is not reserved is described by the dynamic selector schema.
    def pattern_properties
      { RESERVED_SELECTOR_PATTERN => dynamic_selector_schema }
    end

    def dynamic_selector_schema
      contract = Html2rss::Selectors::Config::Selector.new
      contract.schema.json_schema(loose: true).merge(
        description: 'Dynamic selector definition keyed by attribute name.'
      )
    end

    def items_schema
      contract = Html2rss::Selectors::Config::Items.new
      contract.schema.json_schema(loose: true).merge(
        description: 'Defines the items selector and optional enhancement settings.'
      )
    end

    def enclosure_schema
      contract = Html2rss::Selectors::Config::Enclosure.new
      contract.schema.json_schema(loose: true).merge(
        description: 'Describes enclosure extraction settings.'
      )
    end

    # JSON Schema can enforce non-empty reference arrays, while runtime
    # validation remains authoritative for checking that each entry points
    # to an existing sibling selector key.
    #
    # @param description [String] description of the array property
    # @return [Hash] schema fragment for a non-empty array of selector keys
    def reference_array(description)
      entry = {
        type: 'string',
        description: 'Selector key defined elsewhere in this object.'
      }

      { type: 'array', description:, minItems: 1, items: entry }
    end
  end

  ##
  # Converts nested hash keys to strings so the resulting schema serializes cleanly.
  module DeepStringifier
    module_function

    # Recursively stringifies hash keys inside hashes and arrays;
    # every other object is returned unchanged.
    def call(object)
      return stringify_hash(object) if object.is_a?(Hash)
      return object.map { call(_1) } if object.is_a?(Array)

      object
    end

    # Builds a new hash with `to_s` keys and recursively converted values.
    def stringify_hash(object)
      object.each_with_object({}) do |(key, value), stringified|
        stringified[key.to_s] = call(value)
      end
    end
  end
end
207
+ end
208
+ end
@@ -0,0 +1,108 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'dry-validation'
4
+
5
+ module Html2rss
6
+ class Config
7
# Validates the configuration hash using Dry::Validation.
# The configuration options adhere to the documented schema in README.md.
#
# Structural checks live in the Dry::Schema definitions below; the `rule`
# blocks add checks that need custom logic (header value types, selector
# config, channel URL).
class Validator < Dry::Validation::Contract
  URI_REGEXP = Url::URI_REGEXP
  STYLESHEET_TYPES = RssBuilder::Stylesheet::TYPES
  # Two lowercase letters, optionally followed by "-" and two uppercase letters.
  LANGUAGE_FORMAT_REGEX = /\A[a-z]{2}(-[A-Z]{2})?\z/

  ChannelConfig = Dry::Schema.Params do
    required(:url).filled(:string, format?: URI_REGEXP)
    optional(:title).maybe(:string)
    optional(:description).maybe(:string)
    optional(:language).maybe(:string, format?: LANGUAGE_FORMAT_REGEX)
    optional(:ttl).maybe(:integer, gt?: 0)
    optional(:time_zone).maybe(:string)
  end

  StylesheetConfig = Dry::Schema.Params do
    required(:href).filled(:string)
    required(:type).filled(:string, included_in?: STYLESHEET_TYPES)
    optional(:media).maybe(:string)
  end

  BrowserlessPreloadClickSelectorConfig = Dry::Schema.Params do
    required(:selector).filled(:string)
    optional(:max_clicks).filled(:integer, gt?: 0)
    optional(:wait_after_ms).filled(:integer, gteq?: 0)
  end

  BrowserlessPreloadScrollConfig = Dry::Schema.Params do
    optional(:iterations).filled(:integer, gt?: 0)
    optional(:wait_after_ms).filled(:integer, gteq?: 0)
  end

  BrowserlessPreloadConfig = Dry::Schema.Params do
    optional(:wait_after_ms).filled(:integer, gteq?: 0)
    optional(:click_selectors).array(BrowserlessPreloadClickSelectorConfig)
    optional(:scroll_down).hash(BrowserlessPreloadScrollConfig)
  end

  BrowserlessRequestConfig = Dry::Schema.Params do
    optional(:preload).hash(BrowserlessPreloadConfig)
  end

  RequestConfig = Dry::Schema.Params do
    optional(:max_redirects).filled(:integer, gteq?: 0)
    optional(:max_requests).filled(:integer, gt?: 0)
    optional(:browserless).hash(BrowserlessRequestConfig)
  end

  params do
    required(:strategy).filled(:symbol)
    required(:channel).hash(ChannelConfig)
    optional(:headers).hash
    optional(:stylesheets).array(StylesheetConfig)
    optional(:auto_source).hash(AutoSource::Config)
    optional(:selectors).hash
    optional(:dynamic_params_error).maybe(:string)
    optional(:request).hash(RequestConfig)
  end

  # Every header value has to be a String; failures are keyed per header.
  rule(:headers) do
    value&.each do |header_name, header_value|
      next if header_value.is_a?(String)

      key([:headers, header_name]).failure("must be a String, but got #{header_value.class}")
    end
  end

  # A present `dynamic_params_error` value is reported as a base failure.
  rule(:dynamic_params_error) do
    next unless value

    base.failure(value)
  end

  # Ensure at least one of :selectors or :auto_source is present.
  rule(:selectors, :auto_source) do
    next if values.key?(:selectors) || values.key?(:auto_source)

    base.failure("Configuration must include at least 'selectors' or 'auto_source'")
  end

  # Selector details are checked by Html2rss::Selectors::Config; each of its
  # error texts is reported under :selectors.
  rule(:selectors) do
    next unless value

    selector_errors = Html2rss::Selectors::Config.call(value).errors
    next if selector_errors.empty?

    selector_errors.each { |selector_error| key(:selectors).failure(selector_error.text) }
  end

  # URL validation delegated to Url class
  rule(:channel) do
    channel = values[:channel]
    next unless channel&.key?(:url)

    url_string = channel[:url]
    next if url_string.nil? || url_string.empty?

    begin
      Html2rss::Url.for_channel(url_string)
    rescue ArgumentError => error
      key(%i[channel url]).failure(error.message)
    end
  end
end
107
+ end
108
+ end
@@ -1,82 +1,133 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- require 'forwardable'
3
+ require 'json'
4
+ require 'yaml'
4
5
 
5
6
  module Html2rss
6
7
  ##
7
- # The Config class abstracts from the config data structure and
8
- # provides default values.
8
+ # The provided configuration is used to generate the RSS feed.
9
+ # This class provides methods to load and process configuration from a YAML file,
10
+ # supporting both single and multiple feed configurations.
11
+ #
12
+ # Configuration is validated during initialization.
9
13
  class Config
10
- extend Forwardable
14
+ class InvalidConfig < Html2rss::Error; end
15
+ extend ClassMethods
11
16
 
12
17
  ##
13
- # The Error class to be thrown when a feed config requires params, but none
14
- # were passed to Config.
15
- class ParamsMissing < Html2rss::Error; end
16
-
17
- ##
18
- # Thrown when the feed config does not contain a value at `:channel`.
19
- class ChannelMissing < Html2rss::Error; end
20
-
21
- def_delegator :@channel, :author
22
- def_delegator :@channel, :ttl
23
- def_delegator :@channel, :title
24
- def_delegator :@channel, :language
25
- def_delegator :@channel, :description
26
- def_delegator :@channel, :url
27
- def_delegator :@channel, :url, :link
28
- def_delegator :@channel, :time_zone
29
- def_delegator :@channel, :json?
30
- def_delegator :@channel, :strategy
31
-
32
- def_delegator :@selectors, :item_selector_names
33
- def_delegator :@selectors, :selector?
34
- def_delegator :@selectors, :category_selector_names
35
- def_delegator :@selectors, :guid_selector_names
36
- def_delegator :@selectors, :items_order
37
- def_delegator :@selectors, :selector_string
38
-
39
- ##
40
- # Initializes the Config object with feed configuration, global settings, and parameters.
18
+ # Initializes the configuration object.
41
19
  #
42
- # @param feed_config [Hash<Symbol, Object>] The configuration hash containing `:channel` and `:selectors`.
43
- # @param global [Hash<Symbol, Object>] Global settings hash.
44
- # @param params [Hash<Symbol, String>] Parameters hash.
45
- def initialize(feed_config, global = {}, params = {})
46
- channel_config = feed_config[:channel]
47
- raise ChannelMissing, 'Channel configuration is missing in feed_config' unless channel_config
48
-
49
- @channel = Channel.new(channel_config, params:)
50
- @selectors = Selectors.new(feed_config[:selectors])
51
- @global = global
20
+ # Processes deprecated attributes, applies default values, and validates the configuration.
21
+ #
22
+ # @param config [Hash<Symbol, Object>] the configuration hash.
23
+ # @raise [InvalidConfig] if the configuration fails validation.
24
def initialize(config)
  # Request controls are first read from the raw input, before the Preparer
  # runs; they are replaced below using the validated values.
  @request_controls = RequestControls.from_config(config)
  @config = validated_config_for(Preparer.new.call(config)).freeze

  @request_controls = @request_controls.with_effective_values(
    strategy: @config[:strategy],
    max_redirects: @config.dig(:request, :max_redirects),
    max_requests: @config.dig(:request, :max_requests)
  )
end
53
36
 
37
+ def strategy = request_controls.strategy
38
+ def max_redirects = request_controls.max_redirects
39
+ def max_requests = request_controls.max_requests
40
+ def stylesheets = config[:stylesheets]
41
+
54
42
  ##
55
- # Retrieves selector attributes merged with channel attributes.
56
- #
57
- # @param name [Symbol] Selector name.
58
- # @return [Hash<Symbol, Object>] Merged attributes hash.
59
- def selector_attributes_with_channel(name)
60
- @selectors.selector(name).to_h.merge(channel: @channel)
43
+ # @return [Boolean] whether max_requests was explicitly configured by the caller
44
+ def explicit_max_requests?
45
+ request_controls.explicit?(:max_requests)
61
46
  end
62
47
 
63
48
  ##
64
- # Retrieves headers merged from global settings and channel headers.
65
- #
66
- # @return [Hash] Merged headers hash.
67
- def headers
68
- @global.fetch(:headers, {}).merge(@channel.headers)
49
+ # @return [Html2rss::RequestControls] request controls with provenance
50
+ attr_reader :request_controls
51
+
52
+ def headers = config[:headers]
53
+ def channel = config[:channel]
54
+ def url = config.dig(:channel, :url)
55
+ def time_zone = config.dig(:channel, :time_zone)
56
+
57
+ def request = config[:request]
58
+
59
+ def selectors = config[:selectors]
60
+ def auto_source = config[:auto_source]
61
+
62
+ private
63
+
64
+ attr_reader :config
65
+
66
# Normalizes raw config input before validation: moves deprecated
# `channel.strategy` / `channel.headers` to the top level, then layers the
# input over the default configs. The caller's hash is never modified.
class Preparer
  ##
  # @param config [Hash<Symbol, Object>] raw config input (may be frozen); left untouched
  # @return [Hash<Symbol, Object>] config with defaults and deprecations applied
  def call(config)
    # Always work on a copy. Only top-level keys are assigned below and the
    # default merging uses the non-destructive Hash#merge, so a shallow copy
    # is enough to keep the caller's hash unchanged.
    prepared = handle_deprecated_channel_attributes(config.dup)
    prepared = apply_default_config(prepared)
    prepared = apply_default_selectors_config(prepared) if prepared[:selectors]
    prepared = apply_default_auto_source_config(prepared) if prepared[:auto_source]

    prepared
  end

  private

  # Promotes `channel.strategy` / `channel.headers` to the top level (with a
  # deprecation warning) when the top-level key is unset, then falls back to
  # the defaults. Mutates and returns the given hash.
  def handle_deprecated_channel_attributes(config)
    # A malformed (non-Hash) channel is left for the validator to report
    # instead of failing here with a TypeError.
    channel = config[:channel].is_a?(Hash) ? config[:channel] : {}

    { strategy: RequestService.default_strategy_name, headers: {} }.each_pair do |key, default_value|
      if !config[key] && (value = channel[key])
        Log.warn("The `channel.#{key}` key is deprecated. Please move the definition of `#{key}` to the top level.")
        config[key] = value
      end

      config[key] ||= default_value
    end

    config
  end

  def apply_default_config(config)
    deep_merge(Config.default_config, config)
  end

  def apply_default_selectors_config(config)
    deep_merge({ selectors: Selectors::DEFAULT_CONFIG }, config)
  end

  def apply_default_auto_source_config(config)
    deep_merge({ auto_source: Html2rss::AutoSource::DEFAULT_CONFIG }, config)
  end

  # Recursively merges nested hashes; for any other value the override wins.
  def deep_merge(base_config, override_config)
    base_config.merge(override_config) do |_key, oldval, newval|
      oldval.is_a?(Hash) && newval.is_a?(Hash) ? deep_merge(oldval, newval) : newval
    end
  end
end
70
115
 
71
- ##
72
- # Retrieves stylesheets from global settings.
73
- #
74
- # @return [Array<Stylesheet>] Array of Stylesheet structs.
75
- def stylesheets
76
- @global.fetch(:stylesheets, []).map { |attributes| Html2rss::RssBuilder::Stylesheet.new(**attributes) }
116
# Runs the Validator contract on the prepared config.
#
# @param config [Hash<Symbol, Object>] prepared config
# @return [Hash<Symbol, Object>] validated config with normalized headers
# @raise [InvalidConfig] when the contract reports errors
def validated_config_for(config)
  result = Validator.new.call(config)
  return normalized_headers(result.to_h) if result.success?

  raise InvalidConfig, "Invalid configuration: #{result.errors.to_h}"
end
78
123
 
79
- # Provides read-only access to the channel object.
80
- attr_reader :channel
124
# Replaces `:headers` in the given hash with the output of
# RequestHeaders.normalize, passing the channel language and URL along.
# The hash is modified in place and returned.
def normalized_headers(validated_config)
  validated_config.tap do |settings|
    settings[:headers] = RequestHeaders.normalize(
      settings[:headers],
      channel_language: settings.dig(:channel, :language),
      url: settings.dig(:channel, :url)
    )
  end
end
81
132
  end
82
133
  end
@@ -0,0 +1,6 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Html2rss
4
# Base class for Html2rss errors; library-specific errors such as
# Config::InvalidConfig inherit from it.
class Error < StandardError
end
6
+ end
@@ -0,0 +1,19 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Html2rss
4
+ class HtmlExtractor
5
# Finds the earliest parseable `datetime` attribute value below an article_tag.
class DateExtractor
  # @param article_tag [#css] node that is queried with the `[datetime]` selector
  # @return [DateTime, nil] the earliest parsed value, nil when none could be parsed
  def self.call(article_tag)
    article_tag.css('[datetime]')
               .filter_map { |node| parse(node['datetime']) }
               .min
  end

  # Parses a single attribute value; values DateTime.parse rejects yield nil.
  def self.parse(raw)
    DateTime.parse(raw)
  rescue ArgumentError, TypeError
    nil
  end
  private_class_method :parse
end
18
+ end
19
+ end