html2rss 0.18.0 → 0.19.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (88)
  1. checksums.yaml +4 -4
  2. data/README.md +60 -1
  3. data/lib/html2rss/articles/deduplicator.rb +1 -0
  4. data/lib/html2rss/auto_source/cleanup.rb +11 -0
  5. data/lib/html2rss/auto_source/scraper/html.rb +5 -0
  6. data/lib/html2rss/auto_source/scraper/json_state.rb +96 -16
  7. data/lib/html2rss/auto_source/scraper/microdata.rb +107 -1
  8. data/lib/html2rss/auto_source/scraper/schema/category_extractor.rb +1 -1
  9. data/lib/html2rss/auto_source/scraper/schema/item_list.rb +1 -0
  10. data/lib/html2rss/auto_source/scraper/schema/list_item.rb +3 -1
  11. data/lib/html2rss/auto_source/scraper/schema/thing.rb +21 -0
  12. data/lib/html2rss/auto_source/scraper/schema.rb +15 -4
  13. data/lib/html2rss/auto_source/scraper/semantic_html/anchor_selector.rb +5 -0
  14. data/lib/html2rss/auto_source/scraper/semantic_html.rb +4 -0
  15. data/lib/html2rss/auto_source/scraper/wordpress_api/page_scope.rb +60 -10
  16. data/lib/html2rss/auto_source/scraper/wordpress_api/posts_endpoint.rb +3 -2
  17. data/lib/html2rss/auto_source/scraper/wordpress_api.rb +19 -12
  18. data/lib/html2rss/auto_source/scraper.rb +19 -1
  19. data/lib/html2rss/auto_source.rb +4 -0
  20. data/lib/html2rss/blocked_surface.rb +1 -0
  21. data/lib/html2rss/category_extractor.rb +2 -2
  22. data/lib/html2rss/cli.rb +30 -6
  23. data/lib/html2rss/config/class_methods.rb +24 -35
  24. data/lib/html2rss/config/dynamic_params.rb +6 -4
  25. data/lib/html2rss/config/multiple_feeds_config.rb +3 -2
  26. data/lib/html2rss/config/request_headers.rb +9 -3
  27. data/lib/html2rss/config/schema.rb +33 -1
  28. data/lib/html2rss/config/validator.rb +40 -2
  29. data/lib/html2rss/config.rb +19 -13
  30. data/lib/html2rss/error.rb +25 -0
  31. data/lib/html2rss/feed_pipeline/auto_fallback.rb +127 -0
  32. data/lib/html2rss/feed_pipeline.rb +127 -0
  33. data/lib/html2rss/hash_util.rb +101 -0
  34. data/lib/html2rss/html_extractor/date_extractor.rb +1 -0
  35. data/lib/html2rss/html_extractor/enclosure_extractor.rb +19 -0
  36. data/lib/html2rss/html_extractor/image_extractor.rb +9 -0
  37. data/lib/html2rss/html_extractor.rb +5 -0
  38. data/lib/html2rss/html_navigator.rb +8 -0
  39. data/lib/html2rss/json_feed_builder.rb +1 -0
  40. data/lib/html2rss/rendering/audio_renderer.rb +8 -3
  41. data/lib/html2rss/rendering/description_builder.rb +0 -1
  42. data/lib/html2rss/rendering/image_renderer.rb +17 -7
  43. data/lib/html2rss/rendering/media_renderer.rb +4 -0
  44. data/lib/html2rss/rendering/pdf_renderer.rb +11 -5
  45. data/lib/html2rss/rendering/video_renderer.rb +8 -3
  46. data/lib/html2rss/rendering.rb +11 -2
  47. data/lib/html2rss/request_controls.rb +16 -21
  48. data/lib/html2rss/request_service/botasaurus_contract.rb +161 -0
  49. data/lib/html2rss/request_service/botasaurus_strategy.rb +98 -0
  50. data/lib/html2rss/request_service/context.rb +14 -2
  51. data/lib/html2rss/request_service/faraday_strategy.rb +6 -4
  52. data/lib/html2rss/request_service/policy.rb +4 -0
  53. data/lib/html2rss/request_service/response.rb +9 -1
  54. data/lib/html2rss/request_service.rb +19 -0
  55. data/lib/html2rss/request_session/runtime_input.rb +16 -2
  56. data/lib/html2rss/request_session/runtime_policy.rb +7 -0
  57. data/lib/html2rss/request_session.rb +13 -9
  58. data/lib/html2rss/rss_builder/article.rb +22 -1
  59. data/lib/html2rss/rss_builder/channel.rb +11 -2
  60. data/lib/html2rss/rss_builder/enclosure.rb +15 -1
  61. data/lib/html2rss/rss_builder/stylesheet.rb +4 -0
  62. data/lib/html2rss/rss_builder.rb +4 -0
  63. data/lib/html2rss/selectors/config.rb +1 -0
  64. data/lib/html2rss/selectors/extractors/attribute.rb +2 -0
  65. data/lib/html2rss/selectors/extractors/href.rb +2 -0
  66. data/lib/html2rss/selectors/extractors/html.rb +1 -0
  67. data/lib/html2rss/selectors/extractors/static.rb +2 -1
  68. data/lib/html2rss/selectors/extractors/text.rb +1 -0
  69. data/lib/html2rss/selectors/extractors.rb +2 -1
  70. data/lib/html2rss/selectors/object_to_xml_converter.rb +1 -0
  71. data/lib/html2rss/selectors/post_processors/base.rb +13 -7
  72. data/lib/html2rss/selectors/post_processors/gsub.rb +3 -0
  73. data/lib/html2rss/selectors/post_processors/html_to_markdown.rb +3 -0
  74. data/lib/html2rss/selectors/post_processors/html_transformers/transform_urls_to_absolute_ones.rb +9 -0
  75. data/lib/html2rss/selectors/post_processors/html_transformers/wrap_img_in_a.rb +6 -0
  76. data/lib/html2rss/selectors/post_processors/markdown_to_html.rb +3 -0
  77. data/lib/html2rss/selectors/post_processors/parse_time.rb +5 -0
  78. data/lib/html2rss/selectors/post_processors/parse_uri.rb +3 -0
  79. data/lib/html2rss/selectors/post_processors/sanitize_html.rb +5 -1
  80. data/lib/html2rss/selectors/post_processors/substring.rb +3 -0
  81. data/lib/html2rss/selectors/post_processors/template.rb +3 -0
  82. data/lib/html2rss/selectors/post_processors.rb +5 -0
  83. data/lib/html2rss/selectors.rb +7 -0
  84. data/lib/html2rss/url.rb +27 -23
  85. data/lib/html2rss/version.rb +2 -1
  86. data/lib/html2rss.rb +15 -78
  87. data/schema/html2rss-config.schema.json +83 -1
  88. metadata +7 -2
@@ -22,6 +22,9 @@ module Html2rss
22
22
  content_type.first&.to_s || 'application/octet-stream'
23
23
  end
24
24
 
25
+ # @param enclosure [Html2rss::RssBuilder::Enclosure, nil] built enclosure object for the current RSS item
26
+ # @param maker [RSS::Maker::RSS20::ItemsBase::ItemBase] RSS item builder
27
+ # @return [void]
25
28
  def self.add(enclosure, maker)
26
29
  return unless enclosure
27
30
 
@@ -32,6 +35,9 @@ module Html2rss
32
35
  end
33
36
  end
34
37
 
38
+ # @param url [Html2rss::Url] absolute enclosure URL
39
+ # @param type [String, nil] optional enclosure MIME type
40
+ # @param bits_length [Integer] enclosure byte length (historical name)
35
41
  def initialize(url:, type: nil, bits_length: 0)
36
42
  raise ArgumentError, 'An Enclosure requires an absolute URL' if !url || !url.absolute?
37
43
 
@@ -40,9 +46,17 @@ module Html2rss
40
46
  @bits_length = bits_length
41
47
  end
42
48
 
49
+ # @return [String] explicit MIME type or one inferred from URL extension
43
50
  def type = @type || self.class.guess_content_type_from_url(url)
44
51
 
45
- attr_reader :bits_length, :url
52
+ # @return [Integer] enclosure length in bytes
53
+ def bytes_length = @bits_length
54
+
55
+ # @return [Integer] enclosure length in bytes (legacy reader name)
56
+ def bits_length = bytes_length
57
+
58
+ # @return [Html2rss::Url] absolute enclosure URL
59
+ attr_reader :url
46
60
  end
47
61
  end
48
62
  end
@@ -35,8 +35,12 @@ module Html2rss
35
35
  end
36
36
  end
37
37
 
38
+ # Allowed stylesheet MIME types for RSS processing instructions.
38
39
  TYPES = ['text/css', 'text/xsl'].to_set.freeze
39
40
 
41
+ # @param href [String] stylesheet URL
42
+ # @param type [String] MIME type (`text/css` or `text/xsl`)
43
+ # @param media [String] media query hint for the stylesheet
40
44
  def initialize(href:, type:, media: 'all')
41
45
  raise ArgumentError, 'stylesheet.href must be a String' unless href.is_a?(String)
42
46
  raise ArgumentError, 'stylesheet.type invalid' unless TYPES.include?(type)
@@ -7,6 +7,9 @@ module Html2rss
7
7
  # Builds an RSS Feed by providing channel, articles and stylesheets.
8
8
  class RssBuilder
9
9
  class << self
10
+ # @param article [Html2rss::RssBuilder::Article] source article
11
+ # @param item_maker [RSS::Maker::RSS20::ItemsBase::ItemBase] RSS item builder
12
+ # @return [void]
10
13
  def add_item(article, item_maker)
11
14
  add_item_string_values(article, item_maker)
12
15
  add_item_categories(article, item_maker)
@@ -50,6 +53,7 @@ module Html2rss
50
53
  @stylesheets = stylesheets
51
54
  end
52
55
 
56
+ # @return [RSS::Rss] RSS 2.0 document instance
53
57
  def call
54
58
  RSS::Maker.make('2.0') do |maker|
55
59
  Stylesheet.add(maker, stylesheets)
@@ -7,6 +7,7 @@ module Html2rss
7
7
  ##
8
8
  # Validates the configuration hash for :selectors.
9
9
  class Config < Dry::Validation::Contract
10
+ # Required wrapper key used to validate dynamic selector names.
10
11
  NESTING_KEY = :dynamic_keys_workaround
11
12
 
12
13
  ##
@@ -32,6 +32,8 @@ module Html2rss
32
32
  #
33
33
  # @param xml [Nokogiri::XML::Element]
34
34
  # @param options [Options]
35
+ # @option options [String] :selector CSS selector used to find the element
36
+ # @option options [String] :attribute attribute name to extract from the selected element
35
37
  def initialize(xml, options)
36
38
  @options = options
37
39
  @element = Extractors.element(xml, options.selector)
@@ -32,6 +32,8 @@ module Html2rss
32
32
  #
33
33
  # @param xml [Nokogiri::XML::Element]
34
34
  # @param options [Options]
35
+ # @option options [String] :selector CSS selector used to find the link element
36
+ # @option options [Hash{Symbol => Object}] :channel channel configuration, including :url
35
37
  def initialize(xml, options)
36
38
  @options = options
37
39
  @element = Extractors.element(xml, options.selector)
@@ -31,6 +31,7 @@ module Html2rss
31
31
  #
32
32
  # @param xml [Nokogiri::XML::Element]
33
33
  # @param options [Options]
34
+ # @option options [String] :selector CSS selector used to find the element
34
35
  def initialize(xml, options)
35
36
  @element = Extractors.element(xml, options.selector)
36
37
  end
@@ -9,7 +9,7 @@ module Html2rss
9
9
  # Example usage in YAML:
10
10
  #
11
11
  # selectors:
12
- # author:
12
+ # byline:
13
13
  # extractor: static
14
14
  # static: Foobar
15
15
  #
@@ -24,6 +24,7 @@ module Html2rss
24
24
  #
25
25
  # @param _xml [nil, Nokogiri::XML::Element] Unused parameter for compatibility with other extractors.
26
26
  # @param options [Options] Options containing the static value.
27
+ # @option options [String, Symbol] :static static value returned by this extractor
27
28
  def initialize(_xml, options)
28
29
  @options = options
29
30
  end
@@ -29,6 +29,7 @@ module Html2rss
29
29
  #
30
30
  # @param xml [Nokogiri::XML::Element]
31
31
  # @param options [Options]
32
+ # @option options [String] :selector CSS selector used to find the element
32
33
  def initialize(xml, options)
33
34
  @element = Extractors.element(xml, options.selector)
34
35
  end
@@ -23,6 +23,7 @@ module Html2rss
23
23
  hash[klass] = klass.const_get(:Options)
24
24
  end
25
25
 
26
+ # Extractor used when none is explicitly configured.
26
27
  DEFAULT_EXTRACTOR = :text
27
28
 
28
29
  class << self
@@ -36,7 +37,7 @@ module Html2rss
36
37
  selector ? xml.css(selector) : xml
37
38
  end
38
39
 
39
- # @param attribute_options [Hash<Symbol, Object>]
40
+ # @param attribute_options [Hash{Symbol => Object}]
40
41
  # Should contain at least `:extractor` (the name) and required options for that extractor.
41
42
  # @param xml [Nokogiri::XML::Document]
42
43
  # @return [Object] instance of the specified item extractor class
@@ -7,6 +7,7 @@ module Html2rss
7
7
  ##
8
8
  # A naive implementation of "Object to XML": converts a Ruby object to XML format.
9
9
  class ObjectToXmlConverter
10
+ # Wrapper tags used for top-level collection conversion.
10
11
  OBJECT_TO_XML_TAGS = {
11
12
  hash: ['<object>', '</object>'],
12
13
  array: ['<array>', '</array>']
@@ -9,7 +9,8 @@ module Html2rss
9
9
  # Validates the presence of required options in the context
10
10
  #
11
11
  # @param keys [Array<Symbol>] the keys to check for presence
12
- # @param context [Hash] the context containing options
12
+ # @param context [Selectors::Context] the context containing options
13
+ # @return [void]
13
14
  # @raise [MissingOption] if any key is missing
14
15
  def self.expect_options(keys, context)
15
16
  keys.each do |key|
@@ -25,13 +26,14 @@ module Html2rss
25
26
  # @param value [Object] the value to check
26
27
  # @param types [Array<Class>, Class] the expected type(s)
27
28
  # @param name [String] the name of the option being checked
28
- # @param context [Selectors::Context] the context
29
+ # @param context [Selectors::Context] call-site context used for richer validation errors
30
+ # @return [void]
29
31
  # @raise [InvalidType] if the value is not of the expected type(s)
30
32
  def self.assert_type(value, types = [], name, context:)
31
33
  return if Array(types).any? { |type| value.is_a?(type) }
32
34
 
33
- options = if context.is_a?(Hash)
34
- context[:options]
35
+ options = if context.respond_to?(:options)
36
+ context.options
35
37
  else
36
38
  { file: File.basename(caller(1, 1).first.split(':').first) }
37
39
  end
@@ -42,6 +44,10 @@ module Html2rss
42
44
 
43
45
  ##
44
46
  # This method validates the arguments passed to the post processor. Must be implemented by subclasses.
47
+ #
48
+ # @param _value [Object] extracted selector value
49
+ # @param _context [Selectors::Context] post-processor execution context
50
+ # @return [void]
45
51
  def self.validate_args!(_value, _context)
46
52
  raise NotImplementedError, 'You must implement the `validate_args!` method in the post processor'
47
53
  end
@@ -49,11 +55,10 @@ module Html2rss
49
55
  # Initializes the post processor
50
56
  #
51
57
  # @param value [Object] the value to be processed
52
- # @param context [Selectors::Context] the context
58
+ # @param context [Selectors::Context] runtime selector context and options
53
59
  def initialize(value, context)
54
60
  klass = self.class
55
- # TODO: get rid of Hash
56
- klass.assert_type(context, [Selectors::Context, Hash], 'context', context:)
61
+ klass.assert_type(context, Selectors::Context, 'context', context:)
57
62
  klass.validate_args!(value, context)
58
63
 
59
64
  @value = value
@@ -64,6 +69,7 @@ module Html2rss
64
69
 
65
70
  # Abstract method to be implemented by subclasses
66
71
  #
72
+ # @return [Object] transformed value
67
73
  # @raise [NotImplementedError] if not implemented in subclass
68
74
  def get
69
75
  raise NotImplementedError, 'You must implement the `get` method in the post processor'
@@ -29,6 +29,9 @@ module Html2rss
29
29
  #
30
30
  # See the doc on [String#gsub](https://ruby-doc.org/core/String.html#method-i-gsub) for more info.
31
31
  class Gsub < Base
32
+ # @param value [String] extracted selector value
33
+ # @param context [Selectors::Context] post-processor context
34
+ # @return [void]
32
35
  def self.validate_args!(value, context)
33
36
  assert_type value, String, :value, context:
34
37
  expect_options(%i[replacement pattern], context)
@@ -28,6 +28,9 @@ module Html2rss
28
28
  # Would return:
29
29
  # 'Lorem **ipsum** dolor'
30
30
  class HtmlToMarkdown < Base
31
+ # @param value [String] extracted selector value
32
+ # @param context [Selectors::Context] post-processor context
33
+ # @return [void]
31
34
  def self.validate_args!(value, context)
32
35
  assert_type value, String, :value, context:
33
36
  end
@@ -3,10 +3,12 @@
3
3
  module Html2rss
4
4
  class Selectors
5
5
  module PostProcessors
6
+ # HTML tree transformers used by selectors post-processing.
6
7
  module HtmlTransformers
7
8
  ##
8
9
  # Transformer that converts relative URLs to absolute URLs within specified HTML elements.
9
10
  class TransformUrlsToAbsoluteOnes
11
+ # HTML tags and the URL-bearing attribute that should be normalized.
10
12
  URL_ELEMENTS_WITH_URL_ATTRIBUTE = {
11
13
  'a' => :href, # Visible link
12
14
  'img' => :src, # Visible image
@@ -15,12 +17,19 @@ module Html2rss
15
17
  'video' => :src # Video player is visible
16
18
  }.freeze
17
19
 
20
+ # @param channel_url [String, Html2rss::Url] base URL used to resolve relative links
18
21
  def initialize(channel_url)
19
22
  @channel_url = channel_url
20
23
  end
21
24
 
22
25
  ##
23
26
  # Transforms URLs to absolute ones.
27
+ #
28
+ # @param node_name [String] node name currently being transformed
29
+ # @param node [Nokogiri::XML::Node] node currently being transformed
30
+ # @param _env [Hash] transformer context
31
+ # @option _env [Object] :_reserved reserved for transformer pipeline context
32
+ # @return [void]
24
33
  def call(node_name:, node:, **_env)
25
34
  return unless URL_ELEMENTS_WITH_URL_ATTRIBUTE.key?(node_name)
26
35
 
@@ -12,6 +12,8 @@ module Html2rss
12
12
  #
13
13
  # @param node_name [String]
14
14
  # @param node [Nokogiri::XML::Node]
15
+ # @param _env [Hash] transformer context
16
+ # @option _env [Object] :_reserved reserved for transformer pipeline context
15
17
  # @return [nil]
16
18
  def call(node_name:, node:, **_env)
17
19
  return unless should_process?(node_name)
@@ -19,10 +21,14 @@ module Html2rss
19
21
  wrap_image_in_anchor(node) unless already_wrapped?(node)
20
22
  end
21
23
 
24
+ # @param node_name [String] node name currently being transformed
25
+ # @return [Boolean] whether this transformer should run for the node
22
26
  def should_process?(node_name)
23
27
  node_name == 'img'
24
28
  end
25
29
 
30
+ # @param node [Nokogiri::XML::Node] node currently being transformed
31
+ # @return [Boolean] whether the image is already wrapped in a link
26
32
  def already_wrapped?(node)
27
33
  node.parent.name == 'a'
28
34
  end
@@ -34,6 +34,9 @@ module Html2rss
34
34
  #
35
35
  # <p>Price: 12.34</p>
36
36
  class MarkdownToHtml < Base
37
+ # @param value [String] extracted selector value
38
+ # @param context [Selectors::Context] post-processor context
39
+ # @return [void]
37
40
  def self.validate_args!(value, context)
38
41
  assert_type value, String, :value, context:
39
42
  end
@@ -27,6 +27,9 @@ module Html2rss
27
27
  #
28
28
  # It uses `Time.parse`.
29
29
  class ParseTime < Base
30
+ # @param value [String] extracted selector value
31
+ # @param context [Selectors::Context] post-processor context
32
+ # @return [void]
30
33
  def self.validate_args!(value, context)
31
34
  assert_type(value, String, :value, context:)
32
35
  time_zone_value = time_zone(context)
@@ -38,6 +41,8 @@ module Html2rss
38
41
  assert_type(time_zone_value, String, :time_zone, context:)
39
42
  end
40
43
 
44
+ # @param context [Selectors::Context] post-processor context
45
+ # @return [String, nil] configured channel time zone
41
46
  def self.time_zone(context) = context.dig(:config, :channel, :time_zone)
42
47
 
43
48
  ##
@@ -23,6 +23,9 @@ module Html2rss
23
23
  # Would return:
24
24
  # 'http://why-not-use-a-link.uh'
25
25
  class ParseUri < Base
26
+ # @param value [String] extracted selector value
27
+ # @param _context [Selectors::Context] post-processor context
28
+ # @return [void]
26
29
  def self.validate_args!(value, _context)
27
30
  raise ArgumentError, 'The `value` option is missing or empty.' if value.to_s.empty?
28
31
  end
@@ -83,6 +83,9 @@ module Html2rss
83
83
  'preload' => 'none'
84
84
  }
85
85
  }.freeze
86
+ # @param value [String] extracted selector value
87
+ # @param context [Selectors::Context] post-processor context
88
+ # @return [void]
86
89
  def self.validate_args!(value, context)
87
90
  assert_type value, String, :value, context:
88
91
  end
@@ -95,7 +98,8 @@ module Html2rss
95
98
  def self.get(html, url)
96
99
  return nil if String(html).empty?
97
100
 
98
- new(html, config: { channel: { url: } }).get
101
+ context = Selectors::Context.new(config: { channel: { url: } }, options: {})
102
+ new(html, context).get
99
103
  end
100
104
 
101
105
  ##
@@ -30,6 +30,9 @@ module Html2rss
30
30
  # Would return:
31
31
  # 'bar'
32
32
  class Substring < Base
33
+ # @param value [String] extracted selector value
34
+ # @param context [Selectors::Context] post-processor context
35
+ # @return [void]
33
36
  def self.validate_args!(value, context)
34
37
  assert_type value, String, :value, context:
35
38
 
@@ -34,6 +34,9 @@ module Html2rss
34
34
  # Would return:
35
35
  # 'Product (23,42€)'
36
36
  class Template < Base
37
+ # @param value [String] extracted selector value
38
+ # @param context [Selectors::Context] post-processor context
39
+ # @return [void]
37
40
  def self.validate_args!(value, context)
38
41
  assert_type value, String, :value, context:
39
42
 
@@ -34,6 +34,11 @@ module Html2rss
34
34
 
35
35
  ##
36
36
  # Shorthand method to instantiate the post processor and call `#get` on it
37
+ #
38
+ # @param name [String, Symbol] post-processor name from selector config
39
+ # @param value [Object] extracted selector value
40
+ # @param context [Selectors::Context] post-processor context
41
+ # @return [Object] transformed selector value
37
42
  def self.get(name, value, context)
38
43
  klass = NAME_TO_CLASS[name.to_sym] || raise(UnknownPostProcessorName, "Unknown name '#{name}'")
39
44
  klass.new(value, context).get
@@ -20,10 +20,14 @@ module Html2rss
20
20
  # A context instance passed to item extractors and post-processors.
21
21
  Context = Struct.new('Context', :options, :item, :config, :scraper, keyword_init: true) # rubocop:disable Style/RedundantStructKeywordInit
22
22
 
23
+ # Default selectors options merged into user configuration.
23
24
  DEFAULT_CONFIG = { items: { enhance: true } }.freeze
24
25
 
26
+ # Selector key that points to the root list of article nodes.
25
27
  ITEMS_SELECTOR_KEY = :items
28
+ # Supported RSS item attributes extractable through selectors.
26
29
  ITEM_TAGS = %i[title url description author comments published_at guid enclosure categories].freeze
30
+ # Item attributes that require dedicated extraction logic.
27
31
  SPECIAL_ATTRIBUTES = Set[:guid, :enclosure, :categories].freeze
28
32
 
29
33
  # Mapping of new attribute names to their legacy names for backward compatibility.
@@ -85,6 +89,7 @@ module Html2rss
85
89
  # Extracts an article hash for a given item element.
86
90
  #
87
91
  # @param item [Nokogiri::XML::Element] The element to extract from.
92
+ # @param page_response [RequestService::Response] response used for selector extraction context
88
93
  # @return [Hash] Hash of attributes for the article.
89
94
  def extract_article(item, page_response = response)
90
95
  @rss_item_attributes.to_h { |key| [key, select(key, item, base_url: page_response.url)] }.compact
@@ -96,6 +101,7 @@ module Html2rss
96
101
  #
97
102
  # @param article_hash [Hash] The original article hash.
98
103
  # @param article_tag [Nokogiri::XML::Element] HTML element to extract additional info from.
104
+ # @param base_url [String, Html2rss::Url] base URL for normalization during enhancement
99
105
  # @return [Hash] The enhanced article hash.
100
106
  def enhance_article_hash(article_hash, article_tag, base_url = @url)
101
107
  selected_anchor = HtmlExtractor.main_anchor_for(article_tag)
@@ -116,6 +122,7 @@ module Html2rss
116
122
  #
117
123
  # @param name [Symbol, String] Name of the attribute.
118
124
  # @param item [Nokogiri::XML::Element] The HTML element to process.
125
+ # @param base_url [String, Html2rss::Url] base URL for relative extraction values
119
126
  # @return [Object, Array<Object>] The selected value(s).
120
127
  # @raise [InvalidSelectorName] If the attribute name is invalid or not defined.
121
128
  def select(name, item, base_url: @url)
data/lib/html2rss/url.rb CHANGED
@@ -25,6 +25,7 @@ module Html2rss
25
25
 
26
26
  # Regular expression for basic URI format validation
27
27
  URI_REGEXP = Addressable::URI::URIREGEX
28
+ # Schemes accepted by channel URL validation.
28
29
  SUPPORTED_SCHEMES = %w[http https].to_set.freeze
29
30
 
30
31
  ##
@@ -107,7 +108,10 @@ module Html2rss
107
108
  def self.validate_channel_url(url)
108
109
  raise ArgumentError, 'URL must be absolute' unless url.absolute?
109
110
 
110
- raise ArgumentError, 'URL must not contain an @ character' if url.to_s.include?('@')
111
+ uri = Addressable::URI.parse(url.to_s)
112
+ has_forbidden_at = uri.user || uri.password
113
+ has_forbidden_at ||= [uri.query, uri.fragment].compact.any? { |value| value.include?('@') }
114
+ raise ArgumentError, 'URL must not contain an @ character' if has_forbidden_at
111
115
 
112
116
  scheme = url.scheme
113
117
  raise ArgumentError, "URL scheme '#{scheme}' is not supported" unless SUPPORTED_SCHEMES.include?(scheme)
@@ -122,31 +126,41 @@ module Html2rss
122
126
  freeze
123
127
  end
124
128
 
125
- # Delegate common URI operations to the underlying URI
129
+ # @return [String] normalized URL string
126
130
  def to_s = @uri.to_s
131
+
132
+ # @return [String, nil] URI scheme, for example `http` or `https`
127
133
  def scheme = @uri.scheme
134
+
135
+ # @return [String, nil] URI host component
128
136
  def host = @uri.host
137
+
138
+ # @return [Integer, nil] URI port component
129
139
  def port = @uri.port
140
+
141
+ # @return [String, nil] URI path component
130
142
  def path = @uri.path
143
+
144
+ # @return [String, nil] URI query string without leading `?`
131
145
  def query = @uri.query
146
+
147
+ # @return [String, nil] URI fragment without leading `#`
132
148
  def fragment = @uri.fragment
149
+
150
+ # @return [Boolean] whether the URL includes scheme and host
133
151
  def absolute? = @uri.absolute?
134
152
 
135
153
  ##
136
154
  # Returns the URL query string as a hash of string keys and values.
137
155
  #
138
156
  # @return [Hash{String => String}] normalized query parameters
139
- def query_values
140
- @uri.query_values(Hash) || {}
141
- end
157
+ def query_values = @uri.query_values(Hash) || {}
142
158
 
143
159
  ##
144
160
  # Returns the URL path split into non-empty segments.
145
161
  #
146
162
  # @return [Array<String>] normalized path segments
147
- def path_segments
148
- @uri.path.to_s.split('/').reject(&:empty?)
149
- end
163
+ def path_segments = @uri.path.to_s.split('/').reject(&:empty?)
150
164
 
151
165
  ##
152
166
  # Returns a copy of the URL with the provided path.
@@ -221,42 +235,32 @@ module Html2rss
221
235
  #
222
236
  # @param other [Url] the other URL to compare with
223
237
  # @return [Integer] -1, 0, or 1 for less than, equal, or greater than
224
- def <=>(other)
225
- to_s <=> other.to_s
226
- end
238
+ def <=>(other) = to_s <=> other.to_s
227
239
 
228
240
  ##
229
241
  # Returns true if this URL is equal to another URL.
230
242
  #
231
243
  # @param other [Object] the other object to compare with
232
244
  # @return [Boolean] true if the URLs are equal
233
- def ==(other)
234
- other.is_a?(Url) && to_s == other.to_s
235
- end
245
+ def ==(other) = other.is_a?(Url) && to_s == other.to_s
236
246
 
237
247
  ##
238
248
  # Supports hash-based comparisons by ensuring equality semantics match `hash`.
239
249
  #
240
250
  # @param other [Object] the other object to compare with
241
251
  # @return [Boolean] true if the URLs are considered equal
242
- def eql?(other)
243
- other.is_a?(Url) && to_s == other.to_s
244
- end
252
+ def eql?(other) = other.is_a?(Url) && to_s == other.to_s
245
253
 
246
254
  ##
247
255
  # Returns the hash code for this URL.
248
256
  #
249
257
  # @return [Integer] the hash code
250
- def hash
251
- to_s.hash
252
- end
258
+ def hash = to_s.hash
253
259
 
254
260
  ##
255
261
  # Returns a string representation of the URL for debugging.
256
262
  #
257
263
  # @return [String] the debug representation
258
- def inspect
259
- "#<#{self.class}:#{object_id} @uri=#{@uri.inspect}>"
260
- end
264
+ def inspect = "#<#{self.class}:#{object_id} @uri=#{@uri.inspect}>"
261
265
  end
262
266
  end
@@ -3,6 +3,7 @@
3
3
  ##
4
4
  # The Html2rss namespace.
5
5
  module Html2rss
6
- VERSION = '0.18.0'
6
+ # Current application version.
7
+ VERSION = '0.19.0'
7
8
  public_constant :VERSION
8
9
  end