html2rss 0.16.0 → 0.18.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (122) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +48 -657
  3. data/exe/html2rss +1 -1
  4. data/html2rss.gemspec +7 -4
  5. data/lib/html2rss/articles/deduplicator.rb +49 -0
  6. data/lib/html2rss/auto_source/cleanup.rb +33 -5
  7. data/lib/html2rss/auto_source/scraper/html.rb +118 -43
  8. data/lib/html2rss/auto_source/scraper/json_state.rb +377 -0
  9. data/lib/html2rss/auto_source/scraper/microdata.rb +399 -0
  10. data/lib/html2rss/auto_source/scraper/schema/category_extractor.rb +102 -0
  11. data/lib/html2rss/auto_source/scraper/schema/item_list.rb +2 -2
  12. data/lib/html2rss/auto_source/scraper/schema/list_item.rb +3 -3
  13. data/lib/html2rss/auto_source/scraper/schema/thing.rb +48 -8
  14. data/lib/html2rss/auto_source/scraper/schema.rb +12 -9
  15. data/lib/html2rss/auto_source/scraper/semantic_html/anchor_selector.rb +199 -0
  16. data/lib/html2rss/auto_source/scraper/semantic_html.rb +84 -78
  17. data/lib/html2rss/auto_source/scraper/wordpress_api/page_scope.rb +261 -0
  18. data/lib/html2rss/auto_source/scraper/wordpress_api/posts_endpoint.rb +134 -0
  19. data/lib/html2rss/auto_source/scraper/wordpress_api.rb +179 -0
  20. data/lib/html2rss/auto_source/scraper.rb +142 -8
  21. data/lib/html2rss/auto_source.rb +119 -47
  22. data/lib/html2rss/blocked_surface.rb +64 -0
  23. data/lib/html2rss/category_extractor.rb +82 -0
  24. data/lib/html2rss/cli.rb +170 -23
  25. data/lib/html2rss/config/class_methods.rb +189 -0
  26. data/lib/html2rss/config/dynamic_params.rb +68 -0
  27. data/lib/html2rss/config/multiple_feeds_config.rb +50 -0
  28. data/lib/html2rss/config/request_headers.rb +130 -0
  29. data/lib/html2rss/config/schema.rb +208 -0
  30. data/lib/html2rss/config/validator.rb +108 -0
  31. data/lib/html2rss/config.rb +112 -61
  32. data/lib/html2rss/error.rb +6 -0
  33. data/lib/html2rss/html_extractor/date_extractor.rb +19 -0
  34. data/lib/html2rss/html_extractor/enclosure_extractor.rb +101 -0
  35. data/lib/html2rss/html_extractor/image_extractor.rb +49 -0
  36. data/lib/html2rss/html_extractor.rb +136 -0
  37. data/lib/html2rss/html_navigator.rb +46 -0
  38. data/lib/html2rss/json_feed_builder/item.rb +94 -0
  39. data/lib/html2rss/json_feed_builder.rb +58 -0
  40. data/lib/html2rss/rendering/audio_renderer.rb +31 -0
  41. data/lib/html2rss/rendering/description_builder.rb +88 -0
  42. data/lib/html2rss/rendering/image_renderer.rb +31 -0
  43. data/lib/html2rss/rendering/media_renderer.rb +33 -0
  44. data/lib/html2rss/rendering/pdf_renderer.rb +28 -0
  45. data/lib/html2rss/rendering/video_renderer.rb +31 -0
  46. data/lib/html2rss/rendering.rb +14 -0
  47. data/lib/html2rss/request_controls.rb +128 -0
  48. data/lib/html2rss/request_service/browserless_strategy.rb +103 -7
  49. data/lib/html2rss/request_service/budget.rb +39 -0
  50. data/lib/html2rss/request_service/context.rb +64 -20
  51. data/lib/html2rss/request_service/faraday_strategy.rb +135 -5
  52. data/lib/html2rss/request_service/policy.rb +248 -0
  53. data/lib/html2rss/request_service/puppet_commander.rb +212 -13
  54. data/lib/html2rss/request_service/response.rb +42 -2
  55. data/lib/html2rss/request_service/response_guard.rb +62 -0
  56. data/lib/html2rss/request_service.rb +31 -15
  57. data/lib/html2rss/request_session/rel_next_pager.rb +70 -0
  58. data/lib/html2rss/request_session/runtime_input.rb +57 -0
  59. data/lib/html2rss/request_session/runtime_policy.rb +76 -0
  60. data/lib/html2rss/request_session.rb +118 -0
  61. data/lib/html2rss/rss_builder/article.rb +166 -0
  62. data/lib/html2rss/rss_builder/channel.rb +96 -11
  63. data/lib/html2rss/rss_builder/enclosure.rb +48 -0
  64. data/lib/html2rss/rss_builder/stylesheet.rb +4 -4
  65. data/lib/html2rss/rss_builder.rb +72 -71
  66. data/lib/html2rss/selectors/config.rb +122 -0
  67. data/lib/html2rss/selectors/extractors/attribute.rb +50 -0
  68. data/lib/html2rss/selectors/extractors/href.rb +53 -0
  69. data/lib/html2rss/selectors/extractors/html.rb +48 -0
  70. data/lib/html2rss/selectors/extractors/static.rb +41 -0
  71. data/lib/html2rss/selectors/extractors/text.rb +46 -0
  72. data/lib/html2rss/selectors/extractors.rb +52 -0
  73. data/lib/html2rss/selectors/object_to_xml_converter.rb +61 -0
  74. data/lib/html2rss/selectors/post_processors/base.rb +74 -0
  75. data/lib/html2rss/selectors/post_processors/gsub.rb +85 -0
  76. data/lib/html2rss/selectors/post_processors/html_to_markdown.rb +45 -0
  77. data/lib/html2rss/selectors/post_processors/html_transformers/transform_urls_to_absolute_ones.rb +35 -0
  78. data/lib/html2rss/selectors/post_processors/html_transformers/wrap_img_in_a.rb +47 -0
  79. data/lib/html2rss/selectors/post_processors/markdown_to_html.rb +52 -0
  80. data/lib/html2rss/selectors/post_processors/parse_time.rb +73 -0
  81. data/lib/html2rss/selectors/post_processors/parse_uri.rb +40 -0
  82. data/lib/html2rss/selectors/post_processors/sanitize_html.rb +150 -0
  83. data/lib/html2rss/selectors/post_processors/substring.rb +74 -0
  84. data/lib/html2rss/selectors/post_processors/template.rb +73 -0
  85. data/lib/html2rss/selectors/post_processors.rb +43 -0
  86. data/lib/html2rss/selectors.rb +294 -0
  87. data/lib/html2rss/url.rb +262 -0
  88. data/lib/html2rss/version.rb +1 -1
  89. data/lib/html2rss.rb +129 -70
  90. data/lib/tasks/config_schema.rake +17 -0
  91. data/schema/html2rss-config.schema.json +469 -0
  92. metadata +120 -46
  93. data/lib/html2rss/attribute_post_processors/base.rb +0 -74
  94. data/lib/html2rss/attribute_post_processors/gsub.rb +0 -64
  95. data/lib/html2rss/attribute_post_processors/html_to_markdown.rb +0 -43
  96. data/lib/html2rss/attribute_post_processors/html_transformers/transform_urls_to_absolute_ones.rb +0 -27
  97. data/lib/html2rss/attribute_post_processors/html_transformers/wrap_img_in_a.rb +0 -41
  98. data/lib/html2rss/attribute_post_processors/markdown_to_html.rb +0 -50
  99. data/lib/html2rss/attribute_post_processors/parse_time.rb +0 -46
  100. data/lib/html2rss/attribute_post_processors/parse_uri.rb +0 -46
  101. data/lib/html2rss/attribute_post_processors/sanitize_html.rb +0 -108
  102. data/lib/html2rss/attribute_post_processors/substring.rb +0 -72
  103. data/lib/html2rss/attribute_post_processors/template.rb +0 -101
  104. data/lib/html2rss/attribute_post_processors.rb +0 -44
  105. data/lib/html2rss/auto_source/article.rb +0 -127
  106. data/lib/html2rss/auto_source/channel.rb +0 -78
  107. data/lib/html2rss/auto_source/reducer.rb +0 -48
  108. data/lib/html2rss/auto_source/rss_builder.rb +0 -70
  109. data/lib/html2rss/auto_source/scraper/semantic_html/extractor.rb +0 -136
  110. data/lib/html2rss/auto_source/scraper/semantic_html/image.rb +0 -54
  111. data/lib/html2rss/config/channel.rb +0 -125
  112. data/lib/html2rss/config/selectors.rb +0 -103
  113. data/lib/html2rss/item.rb +0 -186
  114. data/lib/html2rss/item_extractors/attribute.rb +0 -50
  115. data/lib/html2rss/item_extractors/href.rb +0 -52
  116. data/lib/html2rss/item_extractors/html.rb +0 -46
  117. data/lib/html2rss/item_extractors/static.rb +0 -39
  118. data/lib/html2rss/item_extractors/text.rb +0 -44
  119. data/lib/html2rss/item_extractors.rb +0 -88
  120. data/lib/html2rss/object_to_xml_converter.rb +0 -56
  121. data/lib/html2rss/rss_builder/item.rb +0 -83
  122. data/lib/html2rss/utils.rb +0 -113
@@ -0,0 +1,262 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'addressable/uri'
4
+ require 'cgi'
5
+
6
module Html2rss
  ##
  # A value object representing a resolved, absolute URL with built-in operations.
  # Provides URL resolution, sanitization, and titleization capabilities.
  # Instances freeze themselves and the wrapped Addressable::URI on construction.
  #
  # @example Creating a URL from a relative path
  #   url = Url.from_relative('/path/to/article', 'https://example.com')
  #   url.to_s # => "https://example.com/path/to/article"
  #
  # @example Sanitizing a raw URL string
  #   url = Url.sanitize('https://example.com/ ')
  #   url.to_s # => "https://example.com/"
  #
  # @example Getting titleized versions
  #   url = Url.from_relative('/foo-bar/baz.txt', 'https://example.com')
  #   url.titleized         # => "Foo Bar Baz"
  #   url.channel_titleized # => "example.com: Foo-bar Baz.txt"
  class Url
    include Comparable

    # Regular expression for basic URI format validation
    URI_REGEXP = Addressable::URI::URIREGEX
    # Schemes accepted by {.for_channel}.
    SUPPORTED_SCHEMES = %w[http https].to_set.freeze

    ##
    # Creates a URL from a relative path and base URL.
    # An already-absolute +relative_url+ is wrapped as parsed (not normalized);
    # a relative one is joined onto the base and normalized.
    #
    # @param relative_url [String, Html2rss::Url] the relative URL to resolve
    # @param base_url [String, Html2rss::Url] the base URL to resolve against
    # @return [Url] the resolved absolute URL
    # @raise [Addressable::URI::InvalidURIError] if either URL cannot be parsed
    #   (unlike {.from_absolute}, this method does not convert the error)
    def self.from_relative(relative_url, base_url)
      url = Addressable::URI.parse(relative_url.to_s.strip)
      return new(url) if url.absolute?

      base_uri = Addressable::URI.parse(base_url.to_s)
      # Give a path-less base an explicit root path before joining.
      base_uri.path = '/' if base_uri.path.empty?

      new(base_uri.join(url).normalize)
    end

    ##
    # Creates a URL by sanitizing a raw URL string.
    # Extracts the first http(s)/ftp/mailto URL found in the string; the match
    # stops at whitespace, angle brackets and double quotes.
    #
    # @param raw_url [String] the raw URL string to sanitize
    # @return [Url, nil] the sanitized URL, or nil if no valid URL found
    def self.sanitize(raw_url)
      matched_urls = raw_url.to_s.scan(%r{(?:(?:https?|ftp|mailto)://|mailto:)[^\s<>"]+})
      url = matched_urls.first.to_s.strip
      return nil if url.empty?

      new(Addressable::URI.parse(url).normalize)
    end

    ##
    # Creates a URL from an already-absolute URL string.
    #
    # @param url_string [String, Html2rss::Url] the absolute URL to parse
    # @return [Url] the parsed and normalized URL (a Url argument is returned as is)
    # @raise [ArgumentError] if the URL is not absolute or cannot be parsed
    def self.from_absolute(url_string)
      return url_string if url_string.is_a?(self)

      url = new(Addressable::URI.parse(url_string.to_s.strip).normalize)
      raise ArgumentError, 'URL must be absolute' unless url.absolute?

      url
    rescue Addressable::URI::InvalidURIError
      raise ArgumentError, 'URL must be absolute'
    end

    ##
    # Creates a URL for channel use with validation.
    # Validates that the URL meets channel requirements (absolute, no @, supported schemes).
    #
    # @param url_string [String, nil] the URL string to validate and parse
    # @return [Url, nil] the validated and parsed URL, or nil when the input is nil or blank
    # @raise [ArgumentError] if the URL doesn't meet channel requirements
    # @example Creating a channel URL
    #   Url.for_channel('https://example.com').to_s
    #   # => "https://example.com/"
    # @example Invalid channel URL
    #   Url.for_channel('/relative/path')
    #   # => raises ArgumentError: "URL must be absolute"
    def self.for_channel(url_string)
      # One normalization step covers nil, empty and whitespace-only input.
      stripped = url_string.to_s.strip
      return nil if stripped.empty?

      from_absolute(stripped).tap { |url| validate_channel_url(url) }
    end

    ##
    # Validates that a URL meets channel requirements.
    #
    # @param url [Url] the URL to validate
    # @return [void]
    # @raise [ArgumentError] if the URL doesn't meet channel requirements
    def self.validate_channel_url(url)
      raise ArgumentError, 'URL must be absolute' unless url.absolute?

      raise ArgumentError, 'URL must not contain an @ character' if url.to_s.include?('@')

      scheme = url.scheme
      raise ArgumentError, "URL scheme '#{scheme}' is not supported" unless SUPPORTED_SCHEMES.include?(scheme)
    end

    private_class_method :validate_channel_url

    ##
    # Wraps the given URI. Note that the passed object itself is frozen, not a copy.
    #
    # @param uri [Addressable::URI] the underlying Addressable::URI object (internal use only)
    def initialize(uri)
      @uri = uri.freeze
      freeze
    end

    # Delegate common URI operations to the underlying URI
    def to_s = @uri.to_s
    def scheme = @uri.scheme
    def host = @uri.host
    def port = @uri.port
    def path = @uri.path
    def query = @uri.query
    def fragment = @uri.fragment
    def absolute? = @uri.absolute?

    ##
    # Returns the URL query string as a hash of string keys and values.
    #
    # @return [Hash{String => String}] query parameters, or an empty hash when there is no query
    def query_values
      @uri.query_values(Hash) || {}
    end

    ##
    # Returns the URL path split into non-empty segments.
    #
    # @return [Array<String>] path segments without empty entries
    def path_segments
      @uri.path.to_s.split('/').reject(&:empty?)
    end

    ##
    # Returns a copy of the URL with the provided path.
    #
    # @param path [String] normalized absolute path
    # @return [Url] a new URL with the updated path
    def with_path(path)
      uri = @uri.dup
      uri.path = path
      self.class.from_absolute(uri.normalize.to_s)
    end

    ##
    # Returns a copy of the URL with the provided query values.
    #
    # @param values [Hash{String, Symbol => #to_s}] query parameters to assign
    # @return [Url] a new URL with the updated query string
    def with_query_values(values)
      uri = @uri.dup
      uri.query_values = values.transform_keys(&:to_s).transform_values(&:to_s)
      self.class.from_absolute(uri.normalize.to_s)
    end

    ##
    # Returns a titleized representation of the URL path.
    # Converts the path to a human-readable title by cleaning and capitalizing words.
    # Removes the file extension of the last path segment and replaces special
    # characters with spaces, then capitalizes each word.
    #
    # @return [String] the titleized path, or empty string if path is empty
    # @example Basic titleization
    #   url = Url.from_absolute('https://example.com/foo-bar/baz.txt')
    #   url.titleized # => "Foo Bar Baz"
    # @example With URL encoding
    #   url = Url.from_absolute('https://example.com/hello%20world/article.html')
    #   url.titleized # => "Hello World Article"
    # @example Dots in directory names are preserved
    #   url = Url.from_absolute('https://example.com/v1.2/users')
    #   url.titleized # => "V1.2 Users"
    def titleized
      segments = CGI.unescapeURIComponent(@uri.path).split('/')
      return '' if segments.empty?

      # Strip the extension from the final segment only. Applying File.basename
      # to the joined title would treat a dot in an earlier segment as the start
      # of the "extension" and silently drop every following word.
      segments[-1] = File.basename(segments.last, '.*')

      segments
        .flat_map { |segment| segment.gsub(/[^a-zA-Z0-9.]/, ' ').split }
        .map(&:capitalize)
        .join(' ')
    end

    ##
    # Returns a titleized representation of the URL with prefixed host.
    # Creates a channel title by combining the host with the capitalized,
    # space-joined path segments. Segments are only URL-decoded and capitalized;
    # unlike {#titleized}, special characters and file extensions are kept.
    #
    # @return [String] the titleized channel URL
    # @example With path
    #   url = Url.from_absolute('https://example.com/foo-bar/baz')
    #   url.channel_titleized # => "example.com: Foo-bar Baz"
    # @example Without path (root URL)
    #   url = Url.from_absolute('https://example.com')
    #   url.channel_titleized # => "example.com"
    def channel_titleized
      nicer_path = CGI.unescapeURIComponent(@uri.path).split('/').reject(&:empty?)
      host = @uri.host

      nicer_path.any? ? "#{host}: #{nicer_path.map(&:capitalize).join(' ')}" : host
    end

    ##
    # Orders this URL relative to another by comparing string representations.
    # Backs the Comparable operators (<, <=, between?, ...); equality itself is
    # handled by the stricter {#==}.
    #
    # @param other [#to_s] the other URL to compare with
    # @return [Integer] -1, 0, or 1 for less than, equal, or greater than
    def <=>(other)
      to_s <=> other.to_s
    end

    ##
    # Returns true if this URL is equal to another URL.
    # Only other Url instances with the same string representation are equal.
    #
    # @param other [Object] the other object to compare with
    # @return [Boolean] true if the URLs are equal
    def ==(other)
      other.is_a?(Url) && to_s == other.to_s
    end

    # Hash-based lookups use eql? together with #hash; both are derived from
    # #to_s, so eql? shares the == implementation.
    alias eql? ==

    ##
    # Returns the hash code for this URL.
    #
    # @return [Integer] the hash code of the string representation
    def hash
      to_s.hash
    end

    ##
    # Returns a string representation of the URL for debugging.
    #
    # @return [String] the debug representation
    def inspect
      "#<#{self.class}:#{object_id} @uri=#{@uri.inspect}>"
    end
  end
end
@@ -3,6 +3,6 @@
3
3
  ##
4
4
  # The Html2rss namespace.
5
5
  module Html2rss
6
- VERSION = '0.16.0'
6
+ VERSION = '0.18.0'
7
7
  public_constant :VERSION
8
8
  end
data/lib/html2rss.rb CHANGED
@@ -3,11 +3,10 @@
3
3
  require 'zeitwerk'
4
4
 
5
5
  loader = Zeitwerk::Loader.for_gem
6
+ loader.inflector.inflect('cli' => 'CLI')
6
7
  loader.setup
7
8
 
8
- require 'addressable'
9
9
  require 'logger'
10
- require 'yaml'
11
10
 
12
11
  ##
13
12
  # The Html2rss namespace.
@@ -23,90 +22,150 @@ module Html2rss
23
22
  end
24
23
 
25
24
  ##
26
- # The Html2rss::Error base class.
27
- class Error < StandardError; end
28
-
29
- ##
30
- # Key for the feeds configuration in the YAML file.
31
- CONFIG_KEY_FEEDS = :feeds
32
-
33
- ##
34
- # Returns an RSS object generated from the provided YAML file configuration.
35
- #
36
- # Example:
25
+ # Loads a feed configuration from YAML.
37
26
  #
38
- # feed = Html2rss.feed_from_yaml_config(File.join(['spec', 'config.test.yml']), 'nuxt-releases')
39
- # # => #<RSS::Rss:0x00007fb2f6331228
40
- #
41
- # @param file [String] Path to the YAML file.
42
- # @param name [String, Symbol, nil] Name of the feed in the YAML file.
43
- # @param global_config [Hash] Global options (e.g., HTTP headers).
44
- # @param params [Hash] Dynamic parameters for the feed configuration.
45
- # @return [RSS::Rss] RSS object generated from the configuration.
46
- def self.feed_from_yaml_config(file, name = nil, global_config: {}, params: {})
47
- yaml = YAML.safe_load_file(file, symbolize_names: true)
48
- feeds = yaml[CONFIG_KEY_FEEDS] || {}
49
-
50
- feed_config = find_feed_config(yaml, feeds, name, global_config)
51
-
52
- feed(Config.new(feed_config, global_config, params))
27
+ # @param file [String] path to the YAML file
28
+ # @param feed_name [String, nil] optional feed name inside a multi-feed config
29
+ # @return [Hash<Symbol, Object>] loaded configuration hash
30
# Thin public wrapper: the YAML loading itself is delegated to Config.load_yaml.
def self.config_from_yaml_file(file, feed_name = nil) = Config.load_yaml(file, feed_name)
54
33
 
55
34
  ##
56
35
  # Returns an RSS object generated from the provided configuration.
57
36
  #
58
- # Example:
59
- #
60
- # feed = Html2rss.feed(
61
- # channel: { name: 'StackOverflow: Hot Network Questions', url: 'https://stackoverflow.com' },
62
- # selectors: {
63
- # items: { selector: '#hot-network-questions > ul > li' },
64
- # title: { selector: 'a' },
65
- # link: { selector: 'a', extractor: 'href' }
66
- # }
67
- # )
68
- # # => #<RSS::Rss:0x00007fb2f48d14a0 ...>
69
- #
70
- # @param config [Hash<Symbol, Object>, Html2rss::Config] Feed configuration.
71
- # @return [RSS::Rss] RSS object generated from the configuration.
72
- def self.feed(config)
73
- config = Config.new(config) unless config.is_a?(Config)
74
- RssBuilder.build(config)
37
+ # @param raw_config [Hash<Symbol, Object>] feed configuration
38
+ # @return [RSS::Rss] generated RSS feed
39
def self.feed(raw_config)
  # The pipeline yields the fetched response, the config and the collected articles;
  # this entrypoint only chooses the RSS renderer.
  run_pipeline(raw_config) { |response:, config:, articles:| build_rss_feed(response:, config:, articles:) }
end
76
44
 
77
45
  ##
78
- # Builds the feed configuration based on the provided parameters.
46
+ # Returns a JSONFeed 1.1 hash generated from the provided configuration.
79
47
  #
80
- # @param yaml [Hash] Parsed YAML content.
81
- # @param feeds [Hash] Feeds from the YAML content.
82
- # @param feed_name [String, Symbol, nil] Name of the feed in the YAML file.
83
- # @param global_config [Hash] Global options (e.g., HTTP headers).
84
- # @return [Hash] Feed configuration.
85
- def self.find_feed_config(yaml, feeds, feed_name, global_config)
86
- return yaml unless feed_name
87
-
88
- feed_name = feed_name.to_sym
89
- if feeds.key?(feed_name)
90
- global_config.merge!(yaml.reject { |key| key == CONFIG_KEY_FEEDS })
91
- feeds[feed_name]
92
- else
93
- yaml
48
+ # @param raw_config [Hash<Symbol, Object>] feed configuration
49
+ # @return [Hash] JSONFeed-compliant hash
50
def self.json_feed(raw_config)
  # The pipeline yields the fetched response, the config and the collected articles;
  # this entrypoint only chooses the JSONFeed renderer.
  run_pipeline(raw_config) { |response:, config:, articles:| build_json_feed(response:, config:, articles:) }
end
96
55
 
97
56
  ##
98
57
  # Scrapes the provided URL and returns an RSS object.
99
- # No need for a "feed config".
100
58
  #
101
- # @param url [String] the URL to automatically source the feed from
102
- # @param strategy [Symbol] the request strategy to use
103
- # @return [RSS::Rss]
104
- def self.auto_source(url, strategy: :faraday)
105
- ctx = RequestService::Context.new(url:, headers: {})
106
- response = RequestService.execute(ctx, strategy:)
107
-
108
- Html2rss::AutoSource.new(ctx.url, body: response.body, headers: response.headers).build
59
+ # @param url [String] source page URL
60
+ # @param strategy [Symbol] request strategy to use
61
+ # @param items_selector [String, nil] optional selector hint for item extraction
62
+ # @param max_redirects [Integer, nil] optional redirect limit override
63
+ # @param max_requests [Integer, nil] optional request budget override
64
+ # @return [RSS::Rss] generated RSS feed
65
def self.auto_source(url, strategy: :faraday, items_selector: nil, max_redirects: nil, max_requests: nil)
  # Translate the shortcut arguments into a feed config, then reuse the regular RSS entrypoint.
  generated_config = build_auto_source_config(url:, strategy:, items_selector:, max_redirects:, max_requests:)
  feed(generated_config)
end
68
+
69
+ ##
70
+ # Scrapes the provided URL and returns a JSONFeed 1.1 hash.
71
+ #
72
+ # @param url [String] source page URL
73
+ # @param strategy [Symbol] request strategy to use
74
+ # @param items_selector [String, nil] optional selector hint for item extraction
75
+ # @param max_redirects [Integer, nil] optional redirect limit override
76
+ # @param max_requests [Integer, nil] optional request budget override
77
+ # @return [Hash] JSONFeed-compliant hash
78
def self.auto_json_feed(url, strategy: :faraday, items_selector: nil, max_redirects: nil, max_requests: nil)
  # Translate the shortcut arguments into a feed config, then reuse the regular JSONFeed entrypoint.
  generated_config = build_auto_source_config(url:, strategy:, items_selector:, max_redirects:, max_requests:)
  json_feed(generated_config)
end
110
81
 
111
- private_class_method :find_feed_config
82
class << self
  private

  # Runs the shared pipeline: build the config, fetch the initial page,
  # collect and deduplicate articles, then yield everything to the caller's renderer.
  #
  # @param raw_config [Hash<Symbol, Object>] user-facing feed configuration
  # @yield [response:, config:, articles:] the fetched response, the built config
  #   and the deduplicated articles
  # @return [Object] whatever the block returns
  def run_pipeline(raw_config)
    config = Config.from_hash(raw_config, params: raw_config[:params])

    # A single request session is shared by the initial fetch and all follow-up requests.
    request_session = RequestSession.from_runtime_input(RequestSession::RuntimeInput.from_config(config))
    response = request_session.fetch_initial_response

    articles = Articles::Deduplicator.new(collect_articles(response:, config:, request_session:)).call

    yield response:, config:, articles:
  end

  # Concatenates selector-based articles with auto-source articles (selector articles first).
  def collect_articles(response:, config:, request_session:)
    from_selectors = selector_articles(response:, config:, request_session:)
    from_auto_source = auto_source_articles(response:, config:, request_session:)

    from_selectors + from_auto_source
  end

  # Extracts articles via the configured selectors from every page to scrape.
  #
  # @return [Array] empty when the config has no selectors
  def selector_articles(response:, config:, request_session:)
    selectors = config.selectors
    return [] unless selectors

    selector_page_responses(response:, selectors:, request_session:).flat_map do |page_response|
      Selectors.new(page_response, selectors:, time_zone: config.time_zone).articles
    end
  end

  # Returns the responses to scrape: only the initial one, unless
  # items.pagination.max_pages is configured, in which case RelNextPager supplies them.
  def selector_page_responses(response:, selectors:, request_session:)
    max_pages = selectors.dig(:items, :pagination, :max_pages)
    return [response] unless max_pages

    RequestSession::RelNextPager.new(session: request_session, initial_response: response, max_pages:).to_a
  end

  # Extracts articles via AutoSource.
  #
  # @return [Array] empty when the config has no auto_source section
  def auto_source_articles(response:, config:, request_session:)
    auto_source = config.auto_source
    return [] unless auto_source

    AutoSource.new(response, auto_source, request_session:).articles
  end

  # Renders the RSS output from the channel (response plus config overrides) and the articles.
  def build_rss_feed(response:, config:, articles:)
    rss_channel = RssBuilder::Channel.new(response, overrides: config.channel)

    RssBuilder.new(channel: rss_channel, articles:, stylesheets: config.stylesheets).call
  end

  # Renders the JSONFeed output; it uses the same channel object as the RSS output.
  def build_json_feed(response:, config:, articles:)
    feed_channel = RssBuilder::Channel.new(response, overrides: config.channel)

    JsonFeedBuilder.new(channel: feed_channel, articles:).call
  end

  # Lists which request controls the caller set explicitly, i.e. a strategy
  # other than :faraday or a non-nil limit.
  #
  # @return [Array<Symbol>] subset of :strategy, :max_redirects, :max_requests, in that order
  def explicit_request_control_keys(strategy:, max_redirects:, max_requests:)
    explicitly_set = {
      strategy: strategy != :faraday,
      max_redirects: !max_redirects.nil?,
      max_requests: !max_requests.nil?
    }

    explicitly_set.select { |_key, explicit| explicit }.keys
  end

  # Builds the auto-source feed config for the shortcut entrypoints.
  def build_auto_source_config(url:, strategy:, items_selector:, max_redirects:, max_requests:)
    request_controls = shortcut_request_controls(strategy:, max_redirects:, max_requests:)

    Config.auto_source_config(url:, items_selector:, request_controls:)
  end

  # Wraps the shortcut arguments in a RequestControls object, recording which were explicit.
  def shortcut_request_controls(strategy:, max_redirects:, max_requests:)
    explicit_keys = explicit_request_control_keys(strategy:, max_redirects:, max_requests:)

    RequestControls.new(strategy:, max_redirects:, max_requests:, explicit_keys:)
  end
end
112
169
  end
170
+
171
+ loader.eager_load
@@ -0,0 +1,17 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'json'
4
+ require 'fileutils'
5
+ require_relative '../html2rss'
6
+
7
namespace :config do
  desc 'Generate config JSON schema'
  task :schema do
    schema_file = Html2rss::Config.schema_path
    schema_dir = File.dirname(schema_file)

    # Create the target directory first so the write below cannot fail on a missing folder.
    FileUtils.mkdir_p(schema_dir)

    # The schema is written followed by a single newline.
    File.open(schema_file, 'w') { |io| io.write(Html2rss::Config.json_schema_json, "\n") }

    puts "Generated config schema at #{schema_file}"
  end
end