html2rss 0.17.0 → 0.19.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (127)
  1. checksums.yaml +4 -4
  2. data/README.md +90 -639
  3. data/exe/html2rss +1 -1
  4. data/html2rss.gemspec +5 -2
  5. data/lib/html2rss/articles/deduplicator.rb +50 -0
  6. data/lib/html2rss/auto_source/cleanup.rb +44 -5
  7. data/lib/html2rss/auto_source/scraper/html.rb +123 -43
  8. data/lib/html2rss/auto_source/scraper/json_state.rb +457 -0
  9. data/lib/html2rss/auto_source/scraper/microdata.rb +505 -0
  10. data/lib/html2rss/auto_source/scraper/schema/category_extractor.rb +102 -0
  11. data/lib/html2rss/auto_source/scraper/schema/item_list.rb +3 -2
  12. data/lib/html2rss/auto_source/scraper/schema/list_item.rb +6 -4
  13. data/lib/html2rss/auto_source/scraper/schema/thing.rb +69 -8
  14. data/lib/html2rss/auto_source/scraper/schema.rb +27 -12
  15. data/lib/html2rss/auto_source/scraper/semantic_html/anchor_selector.rb +204 -0
  16. data/lib/html2rss/auto_source/scraper/semantic_html.rb +88 -79
  17. data/lib/html2rss/auto_source/scraper/wordpress_api/page_scope.rb +311 -0
  18. data/lib/html2rss/auto_source/scraper/wordpress_api/posts_endpoint.rb +135 -0
  19. data/lib/html2rss/auto_source/scraper/wordpress_api.rb +186 -0
  20. data/lib/html2rss/auto_source/scraper.rb +160 -8
  21. data/lib/html2rss/auto_source.rb +123 -47
  22. data/lib/html2rss/blocked_surface.rb +65 -0
  23. data/lib/html2rss/category_extractor.rb +82 -0
  24. data/lib/html2rss/cli.rb +194 -23
  25. data/lib/html2rss/config/class_methods.rb +178 -0
  26. data/lib/html2rss/config/dynamic_params.rb +70 -0
  27. data/lib/html2rss/config/multiple_feeds_config.rb +51 -0
  28. data/lib/html2rss/config/request_headers.rb +136 -0
  29. data/lib/html2rss/config/schema.rb +240 -0
  30. data/lib/html2rss/config/validator.rb +146 -0
  31. data/lib/html2rss/config.rb +118 -61
  32. data/lib/html2rss/error.rb +31 -0
  33. data/lib/html2rss/feed_pipeline/auto_fallback.rb +127 -0
  34. data/lib/html2rss/feed_pipeline.rb +127 -0
  35. data/lib/html2rss/hash_util.rb +101 -0
  36. data/lib/html2rss/html_extractor/date_extractor.rb +20 -0
  37. data/lib/html2rss/html_extractor/enclosure_extractor.rb +120 -0
  38. data/lib/html2rss/html_extractor/image_extractor.rb +58 -0
  39. data/lib/html2rss/html_extractor.rb +141 -0
  40. data/lib/html2rss/html_navigator.rb +54 -0
  41. data/lib/html2rss/json_feed_builder/item.rb +94 -0
  42. data/lib/html2rss/json_feed_builder.rb +59 -0
  43. data/lib/html2rss/rendering/audio_renderer.rb +36 -0
  44. data/lib/html2rss/rendering/description_builder.rb +87 -0
  45. data/lib/html2rss/rendering/image_renderer.rb +41 -0
  46. data/lib/html2rss/rendering/media_renderer.rb +37 -0
  47. data/lib/html2rss/rendering/pdf_renderer.rb +34 -0
  48. data/lib/html2rss/rendering/video_renderer.rb +36 -0
  49. data/lib/html2rss/rendering.rb +23 -0
  50. data/lib/html2rss/request_controls.rb +123 -0
  51. data/lib/html2rss/request_service/botasaurus_contract.rb +161 -0
  52. data/lib/html2rss/request_service/botasaurus_strategy.rb +98 -0
  53. data/lib/html2rss/request_service/browserless_strategy.rb +103 -7
  54. data/lib/html2rss/request_service/budget.rb +39 -0
  55. data/lib/html2rss/request_service/context.rb +77 -21
  56. data/lib/html2rss/request_service/faraday_strategy.rb +137 -5
  57. data/lib/html2rss/request_service/policy.rb +252 -0
  58. data/lib/html2rss/request_service/puppet_commander.rb +212 -13
  59. data/lib/html2rss/request_service/response.rb +51 -3
  60. data/lib/html2rss/request_service/response_guard.rb +62 -0
  61. data/lib/html2rss/request_service.rb +50 -15
  62. data/lib/html2rss/request_session/rel_next_pager.rb +70 -0
  63. data/lib/html2rss/request_session/runtime_input.rb +71 -0
  64. data/lib/html2rss/request_session/runtime_policy.rb +83 -0
  65. data/lib/html2rss/request_session.rb +122 -0
  66. data/lib/html2rss/rss_builder/article.rb +187 -0
  67. data/lib/html2rss/rss_builder/channel.rb +105 -11
  68. data/lib/html2rss/rss_builder/enclosure.rb +62 -0
  69. data/lib/html2rss/rss_builder/stylesheet.rb +8 -4
  70. data/lib/html2rss/rss_builder.rb +76 -71
  71. data/lib/html2rss/selectors/config.rb +123 -0
  72. data/lib/html2rss/selectors/extractors/attribute.rb +52 -0
  73. data/lib/html2rss/selectors/extractors/href.rb +55 -0
  74. data/lib/html2rss/selectors/extractors/html.rb +49 -0
  75. data/lib/html2rss/selectors/extractors/static.rb +42 -0
  76. data/lib/html2rss/selectors/extractors/text.rb +47 -0
  77. data/lib/html2rss/selectors/extractors.rb +53 -0
  78. data/lib/html2rss/selectors/object_to_xml_converter.rb +62 -0
  79. data/lib/html2rss/selectors/post_processors/base.rb +80 -0
  80. data/lib/html2rss/selectors/post_processors/gsub.rb +88 -0
  81. data/lib/html2rss/selectors/post_processors/html_to_markdown.rb +48 -0
  82. data/lib/html2rss/selectors/post_processors/html_transformers/transform_urls_to_absolute_ones.rb +44 -0
  83. data/lib/html2rss/selectors/post_processors/html_transformers/wrap_img_in_a.rb +53 -0
  84. data/lib/html2rss/selectors/post_processors/markdown_to_html.rb +55 -0
  85. data/lib/html2rss/selectors/post_processors/parse_time.rb +78 -0
  86. data/lib/html2rss/selectors/post_processors/parse_uri.rb +43 -0
  87. data/lib/html2rss/selectors/post_processors/sanitize_html.rb +154 -0
  88. data/lib/html2rss/selectors/post_processors/substring.rb +77 -0
  89. data/lib/html2rss/selectors/post_processors/template.rb +76 -0
  90. data/lib/html2rss/selectors/post_processors.rb +48 -0
  91. data/lib/html2rss/selectors.rb +301 -0
  92. data/lib/html2rss/url.rb +266 -0
  93. data/lib/html2rss/version.rb +2 -1
  94. data/lib/html2rss.rb +67 -71
  95. data/lib/tasks/config_schema.rake +17 -0
  96. data/schema/html2rss-config.schema.json +551 -0
  97. metadata +120 -38
  98. data/lib/html2rss/attribute_post_processors/base.rb +0 -74
  99. data/lib/html2rss/attribute_post_processors/gsub.rb +0 -64
  100. data/lib/html2rss/attribute_post_processors/html_to_markdown.rb +0 -43
  101. data/lib/html2rss/attribute_post_processors/html_transformers/transform_urls_to_absolute_ones.rb +0 -27
  102. data/lib/html2rss/attribute_post_processors/html_transformers/wrap_img_in_a.rb +0 -41
  103. data/lib/html2rss/attribute_post_processors/markdown_to_html.rb +0 -50
  104. data/lib/html2rss/attribute_post_processors/parse_time.rb +0 -46
  105. data/lib/html2rss/attribute_post_processors/parse_uri.rb +0 -46
  106. data/lib/html2rss/attribute_post_processors/sanitize_html.rb +0 -115
  107. data/lib/html2rss/attribute_post_processors/substring.rb +0 -72
  108. data/lib/html2rss/attribute_post_processors/template.rb +0 -101
  109. data/lib/html2rss/attribute_post_processors.rb +0 -44
  110. data/lib/html2rss/auto_source/article.rb +0 -127
  111. data/lib/html2rss/auto_source/channel.rb +0 -78
  112. data/lib/html2rss/auto_source/reducer.rb +0 -48
  113. data/lib/html2rss/auto_source/rss_builder.rb +0 -70
  114. data/lib/html2rss/auto_source/scraper/semantic_html/extractor.rb +0 -136
  115. data/lib/html2rss/auto_source/scraper/semantic_html/image.rb +0 -54
  116. data/lib/html2rss/config/channel.rb +0 -125
  117. data/lib/html2rss/config/selectors.rb +0 -103
  118. data/lib/html2rss/item.rb +0 -186
  119. data/lib/html2rss/item_extractors/attribute.rb +0 -50
  120. data/lib/html2rss/item_extractors/href.rb +0 -52
  121. data/lib/html2rss/item_extractors/html.rb +0 -46
  122. data/lib/html2rss/item_extractors/static.rb +0 -39
  123. data/lib/html2rss/item_extractors/text.rb +0 -44
  124. data/lib/html2rss/item_extractors.rb +0 -88
  125. data/lib/html2rss/object_to_xml_converter.rb +0 -56
  126. data/lib/html2rss/rss_builder/item.rb +0 -83
  127. data/lib/html2rss/utils.rb +0 -113
# frozen_string_literal: true

require 'nokogiri'
require 'set' # Set[] is used below; require explicitly instead of relying on autoload.

module Html2rss
  ##
  # This scraper is designed to scrape articles from a given HTML page using CSS
  # selectors defined in the feed config.
  #
  # It supports the traditional feed configs that html2rss originally provided,
  # ensuring compatibility with existing setups.
  #
  # Additionally, it uniquely offers the capability to convert JSON into XML,
  # extending its versatility for diverse data processing workflows.
  class Selectors # rubocop:disable Metrics/ClassLength
    class InvalidSelectorName < Html2rss::Error; end

    include Enumerable

    # A context instance passed to item extractors and post-processors.
    Context = Struct.new('Context', :options, :item, :config, :scraper, keyword_init: true) # rubocop:disable Style/RedundantStructKeywordInit

    # Default selectors options merged into user configuration.
    DEFAULT_CONFIG = { items: { enhance: true } }.freeze

    # Selector key that points to the root list of article nodes.
    ITEMS_SELECTOR_KEY = :items
    # Supported RSS item attributes extractable through selectors.
    ITEM_TAGS = %i[title url description author comments published_at guid enclosure categories].freeze
    # Item attributes that require dedicated extraction logic.
    SPECIAL_ATTRIBUTES = Set[:guid, :enclosure, :categories].freeze

    # Mapping of new attribute names to their legacy names for backward compatibility.
    RENAMED_ATTRIBUTES = { published_at: %i[updated pubDate] }.freeze

    ##
    # Initializes a new Selectors instance.
    #
    # @param response [RequestService::Response] The response object.
    # @param selectors [Hash] A hash of CSS selectors.
    # @param time_zone [String] Time zone string used for date parsing.
    def initialize(response, selectors:, time_zone:)
      @response = response
      @url = response.url
      @selectors = selectors
      @time_zone = time_zone

      prepare_selectors!
      @rss_item_attributes = @selectors.keys & Html2rss::RssBuilder::Article::PROVIDED_KEYS
    end

    ##
    # Returns articles extracted from the response.
    # Reverses order if config specifies reverse ordering.
    #
    # @return [Array<Html2rss::RssBuilder::Article>]
    def articles
      @articles ||= @selectors.dig(ITEMS_SELECTOR_KEY, :order) == 'reverse' ? to_a.tap(&:reverse!) : to_a
    end

    ##
    # Iterates over each scraped article.
    #
    # @yield [article] Gives each article as an Html2rss::RssBuilder::Article.
    # @return [Enumerator] An enumerator if no block is given.
    def each(&)
      return enum_for(:each) unless block_given?

      enhance = enhance?

      parsed_body.css(items_selector).each do |item|
        article_hash = extract_article(item, response)

        enhance_article_hash(article_hash, item, response.url) if enhance

        yield Html2rss::RssBuilder::Article.new(**article_hash, scraper: self.class)
      end
    end

    ##
    # Returns the CSS selector for the items.
    # @return [String] the CSS selector for the items
    def items_selector = @selectors.dig(ITEMS_SELECTOR_KEY, :selector)

    ## @return [Boolean] whether to enhance the article hash with auto_source's semantic HTML extraction.
    def enhance? = !!@selectors.dig(ITEMS_SELECTOR_KEY, :enhance)

    ##
    # Extracts an article hash for a given item element.
    #
    # @param item [Nokogiri::XML::Element] The element to extract from.
    # @param page_response [RequestService::Response] response used for selector extraction context
    # @return [Hash] Hash of attributes for the article.
    def extract_article(item, page_response = response)
      @rss_item_attributes.to_h { |key| [key, select(key, item, base_url: page_response.url)] }.compact
    end

    ##
    # Enhances the article hash using semantic HTML extraction.
    # Only adds keys that are missing from the original hash.
    #
    # @param article_hash [Hash] The original article hash.
    # @param article_tag [Nokogiri::XML::Element] HTML element to extract additional info from.
    # @param base_url [String, Html2rss::Url] base URL for normalization during enhancement
    # @return [Hash] The enhanced article hash.
    def enhance_article_hash(article_hash, article_tag, base_url = @url)
      selected_anchor = HtmlExtractor.main_anchor_for(article_tag)
      return article_hash unless selected_anchor

      extracted = HtmlExtractor.new(article_tag, base_url:, selected_anchor:).call
      return article_hash unless extracted

      extracted.each_with_object(article_hash) do |(key, value), hash|
        # Keep existing truthy values; only fill gaps with extracted data.
        next if value.nil? || (hash.key?(key) && hash[key])

        hash[key] = value
      end
    end

    ##
    # Selects the value for a given attribute from an HTML element.
    #
    # @param name [Symbol, String] Name of the attribute.
    # @param item [Nokogiri::XML::Element] The HTML element to process.
    # @param base_url [String, Html2rss::Url] base URL for relative extraction values
    # @return [Object, Array<Object>] The selected value(s).
    # @raise [InvalidSelectorName] If the attribute name is invalid or not defined.
    def select(name, item, base_url: @url)
      name = name.to_sym

      raise InvalidSelectorName, "Attribute selector '#{name}' is reserved for items." if name == ITEMS_SELECTOR_KEY

      selector_key, config = selector_config_for(name)

      if SPECIAL_ATTRIBUTES.member?(selector_key)
        select_special(selector_key, item:, config:, base_url:)
      else
        select_regular(selector_key, item:, config:, base_url:)
      end
    end

    private

    attr_reader :response

    # Normalizes the user-supplied selectors hash.
    # Works on a copy so the caller's (possibly frozen) hash is never mutated.
    def prepare_selectors!
      @selectors = @selectors.dup
      validate_url_and_link_exclusivity!
      fix_url_and_link!
      handle_renamed_attributes!
    end

    def validate_url_and_link_exclusivity!
      return unless @selectors.key?(:url) && @selectors.key?(:link)

      raise InvalidSelectorName, 'You must either use "url" or "link" in your selectors. Using both is not supported.'
    end

    # Maps the legacy :link selector to :url when :url is absent.
    def fix_url_and_link!
      return if @selectors[:url] || !@selectors.key?(:link)

      @selectors[:url] = @selectors[:link]
    end

    # Migrates deprecated selector names to their current equivalents, warning once per occurrence.
    def handle_renamed_attributes!
      RENAMED_ATTRIBUTES.each_pair do |new_name, old_names|
        old_names.each do |old_name|
          next unless @selectors.key?(old_name)

          Html2rss::Log.warn("Selector '#{old_name}' is deprecated. Please rename to '#{new_name}'.")
          @selectors[new_name] ||= @selectors.delete(old_name)
        end
      end
    end

    def parsed_body
      parsed_body_for(response)
    end

    # Returns a Nokogiri document for the response, converting JSON bodies
    # to an XML fragment first. Memoized per response URL.
    def parsed_body_for(page_response)
      @parsed_bodies ||= {}
      @parsed_bodies[page_response.url] ||= if page_response.json_response?
                                              fragment = ObjectToXmlConverter.new(page_response.parsed_body).call
                                              Nokogiri::HTML5.fragment(fragment)
                                            else
                                              page_response.parsed_body
                                            end
    end

    # Dispatches extraction for attributes with dedicated logic (:guid, :enclosure, :categories).
    def select_special(name, item:, config:, base_url:)
      case name
      when :enclosure
        enclosure(item:, config:, base_url:)
      when :guid
        Array(config).map { |selector_name| select(selector_name, item, base_url:) }
      when :categories
        select_categories(category_selectors: config, item:, base_url:)
      end
    end

    # Extracts a plain attribute value and applies any configured post-processing steps.
    def select_regular(_name, item:, config:, base_url:)
      value = Extractors.get(config.merge(channel: channel_context(base_url)), item)

      if value && (post_process_steps = config[:post_process])
        steps = post_process_steps.is_a?(Array) ? post_process_steps : [post_process_steps]
        value = post_process(item, value, steps, base_url:)
      end

      value
    end

    # Runs each post-processing step in order, threading the value through.
    def post_process(item, value, post_process_steps, base_url:)
      post_process_steps.each do |options|
        context = Context.new(config: { channel: { url: base_url, time_zone: @time_zone } },
                              item:, scraper: self, options:)

        value = PostProcessors.get(options[:name], value, context)
      end

      value
    end

    def select_categories(category_selectors:, item:, base_url:)
      Array(category_selectors).flat_map do |selector_name|
        extract_category_values(selector_name, item:, base_url:)
      end
    end

    # Extracts category values for one selector; falls back to regular extraction
    # unless the selector matches multiple nodes.
    def extract_category_values(selector_name, item:, base_url:)
      selector_key, config = selector_config_for(selector_name, allow_nil: true)
      return [] unless config

      nodes = extract_nodes(item:, config:)
      unless node_set_with_multiple_elements?(nodes)
        return Array(select_regular(selector_key, item:, config:, base_url:))
      end

      Array(nodes).flat_map { |node| extract_categories_from_node(node, item:, config:, base_url:) }
    end

    def extract_categories_from_node(node, item:, config:, base_url:)
      values = Extractors.get(category_node_options(config, base_url:), node)
      values = apply_post_process_steps(item:, value: values, post_process_steps: config[:post_process], base_url:)

      Array(values).filter_map { |category| extract_category_text(category) }
    end

    # Converts a category value (node or scalar) to stripped text; nil when blank.
    def extract_category_text(category)
      text = case category
             when Nokogiri::XML::Node, Nokogiri::XML::NodeSet
               HtmlExtractor.extract_visible_text(category)
             else
               category&.to_s
             end

      stripped = text&.strip
      stripped unless stripped.nil? || stripped.empty?
    end

    def node_set_with_multiple_elements?(nodes)
      nodes.is_a?(Nokogiri::XML::NodeSet) && nodes.length > 1
    end

    # Options for extracting from an already-selected node (selector is cleared).
    def category_node_options(selector_config, base_url:)
      selector_config.merge(channel: channel_context(base_url), selector: nil)
    end

    def apply_post_process_steps(item:, value:, post_process_steps:, base_url:)
      return value unless value && post_process_steps

      steps = post_process_steps.is_a?(Array) ? post_process_steps : [post_process_steps]
      post_process(item, value, steps, base_url:)
    end

    # @return [Array(Symbol, Object)] the selector key and its config.
    # @raise [InvalidSelectorName] when undefined and allow_nil is false.
    def selector_config_for(name, allow_nil: false)
      selector_key = name.to_sym

      return [selector_key, @selectors[selector_key]] if @selectors.key?(selector_key)
      return [selector_key, nil] if allow_nil

      raise InvalidSelectorName, "Selector for '#{selector_key}' is not defined."
    end

    def extract_nodes(item:, config:)
      return unless config.respond_to?(:[]) && config[:selector]

      Extractors.element(item, config[:selector])
    end

    def channel_context(base_url)
      { url: base_url, time_zone: @time_zone }
    end

    # @return [Hash] enclosure details.
    def enclosure(item:, config:, base_url:)
      url = Url.from_relative(select_regular(:enclosure, item:, config:, base_url:), base_url)

      { url:, type: config[:content_type] }
    end
  end
end
# frozen_string_literal: true

require 'addressable/uri'
require 'cgi'

module Html2rss
  ##
  # A value object representing a resolved, absolute URL with built-in operations.
  # Provides URL resolution, sanitization, and titleization capabilities.
  # Instances are frozen on creation.
  #
  # @example Creating a URL from a relative path
  #   url = Url.from_relative('/path/to/article', 'https://example.com')
  #   url.to_s # => "https://example.com/path/to/article"
  #
  # @example Sanitizing a raw URL string
  #   url = Url.sanitize('https://example.com/ ')
  #   url.to_s # => "https://example.com/"
  #
  # @example Getting titleized versions
  #   url = Url.from_relative('/foo-bar/baz.txt', 'https://example.com')
  #   url.titleized # => "Foo Bar Baz"
  #   url.channel_titleized # => "example.com: Foo Bar Baz"
  class Url
    include Comparable

    # Regular expression for basic URI format validation
    URI_REGEXP = Addressable::URI::URIREGEX
    # Schemes accepted by channel URL validation.
    SUPPORTED_SCHEMES = %w[http https].to_set.freeze

    ##
    # Resolves a (possibly relative) URL against a base URL.
    #
    # @param relative_url [String, Html2rss::Url] the relative URL to resolve
    # @param base_url [String, Html2rss::Url] the base URL to resolve against
    # @return [Url] the resolved absolute URL
    # @raise [ArgumentError] if the URL cannot be parsed
    def self.from_relative(relative_url, base_url)
      candidate = Addressable::URI.parse(relative_url.to_s.strip)
      return new(candidate) if candidate.absolute?

      base = Addressable::URI.parse(base_url.to_s)
      base.path = '/' if base.path.empty?

      new(base.join(candidate).normalize)
    end

    ##
    # Builds a URL by sanitizing a raw string: extracts the first
    # URL-looking token and normalizes it.
    #
    # @param raw_url [String] the raw URL string to sanitize
    # @return [Url, nil] the sanitized URL, or nil if no valid URL found
    def self.sanitize(raw_url)
      first_match = raw_url.to_s.scan(%r{(?:(?:https?|ftp|mailto)://|mailto:)[^\s<>"]+}).first
      cleaned = first_match.to_s.strip
      return nil if cleaned.empty?

      new(Addressable::URI.parse(cleaned).normalize)
    end

    ##
    # Builds a URL from an already-absolute URL string.
    #
    # @param url_string [String, Html2rss::Url] the absolute URL to parse
    # @return [Url] the parsed and normalized URL
    # @raise [ArgumentError] if the URL is not absolute or cannot be parsed
    def self.from_absolute(url_string)
      return url_string if url_string.is_a?(self)

      parsed = new(Addressable::URI.parse(url_string.to_s.strip).normalize)
      return parsed if parsed.absolute?

      raise ArgumentError, 'URL must be absolute'
    rescue Addressable::URI::InvalidURIError
      raise ArgumentError, 'URL must be absolute'
    end

    ##
    # Builds a URL for channel use, enforcing channel requirements
    # (absolute, no @ in credentials/query/fragment, supported scheme).
    #
    # @param url_string [String] the URL string to validate and parse
    # @return [Url, nil] the validated URL, or nil for nil/blank input
    # @raise [ArgumentError] if the URL doesn't meet channel requirements
    # @example Creating a channel URL
    #   Url.for_channel('https://example.com')
    #   # => #<Html2rss::Url:... @uri=#<Addressable::URI:... URI:https://example.com>>
    # @example Invalid channel URL
    #   Url.for_channel('/relative/path')
    #   # => raises ArgumentError: "URL must be absolute"
    def self.for_channel(url_string)
      return nil if url_string.nil? || url_string.empty?

      stripped = url_string.strip
      return nil if stripped.empty?

      from_absolute(stripped).tap { |url| validate_channel_url(url) }
    end

    ##
    # Validates that a URL meets channel requirements.
    #
    # @param url [Url] the URL to validate
    # @raise [ArgumentError] if the URL doesn't meet channel requirements
    def self.validate_channel_url(url)
      raise ArgumentError, 'URL must be absolute' unless url.absolute?

      parsed = Addressable::URI.parse(url.to_s)
      at_in_credentials = parsed.user || parsed.password
      at_in_tail = [parsed.query, parsed.fragment].compact.any? { |part| part.include?('@') }
      raise ArgumentError, 'URL must not contain an @ character' if at_in_credentials || at_in_tail

      scheme = url.scheme
      raise ArgumentError, "URL scheme '#{scheme}' is not supported" unless SUPPORTED_SCHEMES.include?(scheme)
    end

    private_class_method :validate_channel_url

    ##
    # @param uri [Addressable::URI] the underlying Addressable::URI object (internal use only)
    def initialize(uri)
      @uri = uri.freeze
      freeze
    end

    # @return [String] normalized URL string
    def to_s = @uri.to_s

    # @return [String, nil] URI scheme, for example `http` or `https`
    def scheme = @uri.scheme

    # @return [String, nil] URI host component
    def host = @uri.host

    # @return [Integer, nil] URI port component
    def port = @uri.port

    # @return [String, nil] URI path component
    def path = @uri.path

    # @return [String, nil] URI query string without leading `?`
    def query = @uri.query

    # @return [String, nil] URI fragment without leading `#`
    def fragment = @uri.fragment

    # @return [Boolean] whether the URL includes scheme and host
    def absolute? = @uri.absolute?

    ##
    # Returns the URL query string as a hash of string keys and values.
    #
    # @return [Hash{String => String}] normalized query parameters
    def query_values = @uri.query_values(Hash) || {}

    ##
    # Returns the URL path split into non-empty segments.
    #
    # @return [Array<String>] normalized path segments
    def path_segments = @uri.path.to_s.split('/').reject(&:empty?)

    ##
    # Returns a copy of the URL with the provided path.
    #
    # @param path [String] normalized absolute path
    # @return [Url] a new URL with the updated path
    def with_path(path)
      updated = @uri.dup
      updated.path = path
      self.class.from_absolute(updated.normalize.to_s)
    end

    ##
    # Returns a copy of the URL with the provided query values.
    #
    # @param values [Hash{String, Symbol => #to_s}] query parameters to assign
    # @return [Url] a new URL with the updated query string
    def with_query_values(values)
      updated = @uri.dup
      updated.query_values = values.transform_keys(&:to_s).transform_values(&:to_s)
      self.class.from_absolute(updated.normalize.to_s)
    end

    ##
    # Returns a titleized representation of the URL path.
    # Unescapes the path, strips special characters and the trailing file
    # extension, then capitalizes each remaining word.
    #
    # @return [String] the titleized path, or empty string if path is empty
    # @example Basic titleization
    #   url = Url.from_absolute('https://example.com/foo-bar/baz.txt')
    #   url.titleized # => "Foo Bar Baz"
    # @example With URL encoding
    #   url = Url.from_absolute('https://example.com/hello%20world/article.html')
    #   url.titleized # => "Hello World Article"
    def titleized
      raw_path = @uri.path
      return '' if raw_path.empty?

      words = CGI.unescapeURIComponent(raw_path)
                 .split('/')
                 .flat_map { |segment| segment.gsub(/[^a-zA-Z0-9.]/, ' ').gsub(/\s+/, ' ').split }

      File.basename(words.map(&:capitalize).join(' '), '.*')
    end

    ##
    # Returns a titleized representation of the URL with prefixed host.
    # Combines host and path into a channel title that identifies the source.
    #
    # @return [String] the titleized channel URL
    # @example With path
    #   url = Url.from_absolute('https://example.com/foo-bar/baz')
    #   url.channel_titleized # => "example.com: Foo Bar Baz"
    # @example Without path (root URL)
    #   url = Url.from_absolute('https://example.com')
    #   url.channel_titleized # => "example.com"
    def channel_titleized
      segments = CGI.unescapeURIComponent(@uri.path).split('/').reject(&:empty?)
      return @uri.host if segments.empty?

      "#{@uri.host}: #{segments.map(&:capitalize).join(' ')}"
    end

    ##
    # Orders URLs by their string representations.
    #
    # @param other [Url] the other URL to compare with
    # @return [Integer] -1, 0, or 1 for less than, equal, or greater than
    def <=>(other) = to_s <=> other.to_s

    ##
    # Equality: same class and same string representation.
    #
    # @param other [Object] the other object to compare with
    # @return [Boolean] true if the URLs are equal
    def ==(other) = other.is_a?(Url) && to_s == other.to_s

    ##
    # Hash-key equality, consistent with `hash`.
    #
    # @param other [Object] the other object to compare with
    # @return [Boolean] true if the URLs are considered equal
    def eql?(other) = other.is_a?(Url) && to_s == other.to_s

    ##
    # Returns the hash code for this URL, derived from its string form.
    #
    # @return [Integer] the hash code
    def hash = to_s.hash

    ##
    # Returns a string representation of the URL for debugging.
    #
    # @return [String] the debug representation
    def inspect = "#<#{self.class}:#{object_id} @uri=#{@uri.inspect}>"
  end
end
##
# The Html2rss namespace.
module Html2rss
  # Current application version.
  VERSION = '0.19.0'
  public_constant :VERSION
end