html2rss 0.18.0 → 0.19.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +60 -1
- data/lib/html2rss/articles/deduplicator.rb +1 -0
- data/lib/html2rss/auto_source/cleanup.rb +11 -0
- data/lib/html2rss/auto_source/scraper/html.rb +5 -0
- data/lib/html2rss/auto_source/scraper/json_state.rb +96 -16
- data/lib/html2rss/auto_source/scraper/microdata.rb +107 -1
- data/lib/html2rss/auto_source/scraper/schema/category_extractor.rb +1 -1
- data/lib/html2rss/auto_source/scraper/schema/item_list.rb +1 -0
- data/lib/html2rss/auto_source/scraper/schema/list_item.rb +3 -1
- data/lib/html2rss/auto_source/scraper/schema/thing.rb +21 -0
- data/lib/html2rss/auto_source/scraper/schema.rb +15 -4
- data/lib/html2rss/auto_source/scraper/semantic_html/anchor_selector.rb +5 -0
- data/lib/html2rss/auto_source/scraper/semantic_html.rb +4 -0
- data/lib/html2rss/auto_source/scraper/wordpress_api/page_scope.rb +60 -10
- data/lib/html2rss/auto_source/scraper/wordpress_api/posts_endpoint.rb +3 -2
- data/lib/html2rss/auto_source/scraper/wordpress_api.rb +19 -12
- data/lib/html2rss/auto_source/scraper.rb +19 -1
- data/lib/html2rss/auto_source.rb +4 -0
- data/lib/html2rss/blocked_surface.rb +1 -0
- data/lib/html2rss/category_extractor.rb +2 -2
- data/lib/html2rss/cli.rb +30 -6
- data/lib/html2rss/config/class_methods.rb +24 -35
- data/lib/html2rss/config/dynamic_params.rb +6 -4
- data/lib/html2rss/config/multiple_feeds_config.rb +3 -2
- data/lib/html2rss/config/request_headers.rb +9 -3
- data/lib/html2rss/config/schema.rb +33 -1
- data/lib/html2rss/config/validator.rb +40 -2
- data/lib/html2rss/config.rb +19 -13
- data/lib/html2rss/error.rb +25 -0
- data/lib/html2rss/feed_pipeline/auto_fallback.rb +127 -0
- data/lib/html2rss/feed_pipeline.rb +127 -0
- data/lib/html2rss/hash_util.rb +101 -0
- data/lib/html2rss/html_extractor/date_extractor.rb +1 -0
- data/lib/html2rss/html_extractor/enclosure_extractor.rb +19 -0
- data/lib/html2rss/html_extractor/image_extractor.rb +9 -0
- data/lib/html2rss/html_extractor.rb +5 -0
- data/lib/html2rss/html_navigator.rb +8 -0
- data/lib/html2rss/json_feed_builder.rb +1 -0
- data/lib/html2rss/rendering/audio_renderer.rb +8 -3
- data/lib/html2rss/rendering/description_builder.rb +0 -1
- data/lib/html2rss/rendering/image_renderer.rb +17 -7
- data/lib/html2rss/rendering/media_renderer.rb +4 -0
- data/lib/html2rss/rendering/pdf_renderer.rb +11 -5
- data/lib/html2rss/rendering/video_renderer.rb +8 -3
- data/lib/html2rss/rendering.rb +11 -2
- data/lib/html2rss/request_controls.rb +16 -21
- data/lib/html2rss/request_service/botasaurus_contract.rb +161 -0
- data/lib/html2rss/request_service/botasaurus_strategy.rb +98 -0
- data/lib/html2rss/request_service/context.rb +14 -2
- data/lib/html2rss/request_service/faraday_strategy.rb +6 -4
- data/lib/html2rss/request_service/policy.rb +4 -0
- data/lib/html2rss/request_service/response.rb +9 -1
- data/lib/html2rss/request_service.rb +19 -0
- data/lib/html2rss/request_session/runtime_input.rb +16 -2
- data/lib/html2rss/request_session/runtime_policy.rb +7 -0
- data/lib/html2rss/request_session.rb +13 -9
- data/lib/html2rss/rss_builder/article.rb +22 -1
- data/lib/html2rss/rss_builder/channel.rb +11 -2
- data/lib/html2rss/rss_builder/enclosure.rb +15 -1
- data/lib/html2rss/rss_builder/stylesheet.rb +4 -0
- data/lib/html2rss/rss_builder.rb +4 -0
- data/lib/html2rss/selectors/config.rb +1 -0
- data/lib/html2rss/selectors/extractors/attribute.rb +2 -0
- data/lib/html2rss/selectors/extractors/href.rb +2 -0
- data/lib/html2rss/selectors/extractors/html.rb +1 -0
- data/lib/html2rss/selectors/extractors/static.rb +2 -1
- data/lib/html2rss/selectors/extractors/text.rb +1 -0
- data/lib/html2rss/selectors/extractors.rb +2 -1
- data/lib/html2rss/selectors/object_to_xml_converter.rb +1 -0
- data/lib/html2rss/selectors/post_processors/base.rb +13 -7
- data/lib/html2rss/selectors/post_processors/gsub.rb +3 -0
- data/lib/html2rss/selectors/post_processors/html_to_markdown.rb +3 -0
- data/lib/html2rss/selectors/post_processors/html_transformers/transform_urls_to_absolute_ones.rb +9 -0
- data/lib/html2rss/selectors/post_processors/html_transformers/wrap_img_in_a.rb +6 -0
- data/lib/html2rss/selectors/post_processors/markdown_to_html.rb +3 -0
- data/lib/html2rss/selectors/post_processors/parse_time.rb +5 -0
- data/lib/html2rss/selectors/post_processors/parse_uri.rb +3 -0
- data/lib/html2rss/selectors/post_processors/sanitize_html.rb +5 -1
- data/lib/html2rss/selectors/post_processors/substring.rb +3 -0
- data/lib/html2rss/selectors/post_processors/template.rb +3 -0
- data/lib/html2rss/selectors/post_processors.rb +5 -0
- data/lib/html2rss/selectors.rb +7 -0
- data/lib/html2rss/url.rb +27 -23
- data/lib/html2rss/version.rb +2 -1
- data/lib/html2rss.rb +15 -78
- data/schema/html2rss-config.schema.json +83 -1
- metadata +7 -2
|
@@ -22,6 +22,9 @@ module Html2rss
|
|
|
22
22
|
content_type.first&.to_s || 'application/octet-stream'
|
|
23
23
|
end
|
|
24
24
|
|
|
25
|
+
# @param enclosure [Html2rss::RssBuilder::Enclosure, nil] built enclosure object for the current RSS item
|
|
26
|
+
# @param maker [RSS::Maker::RSS20::ItemsBase::ItemBase] RSS item builder
|
|
27
|
+
# @return [void]
|
|
25
28
|
def self.add(enclosure, maker)
|
|
26
29
|
return unless enclosure
|
|
27
30
|
|
|
@@ -32,6 +35,9 @@ module Html2rss
|
|
|
32
35
|
end
|
|
33
36
|
end
|
|
34
37
|
|
|
38
|
+
# @param url [Html2rss::Url] absolute enclosure URL
|
|
39
|
+
# @param type [String, nil] optional enclosure MIME type
|
|
40
|
+
# @param bits_length [Integer] enclosure byte length (historical name)
|
|
35
41
|
def initialize(url:, type: nil, bits_length: 0)
|
|
36
42
|
raise ArgumentError, 'An Enclosure requires an absolute URL' if !url || !url.absolute?
|
|
37
43
|
|
|
@@ -40,9 +46,17 @@ module Html2rss
|
|
|
40
46
|
@bits_length = bits_length
|
|
41
47
|
end
|
|
42
48
|
|
|
49
|
+
# @return [String] explicit MIME type or one inferred from URL extension
|
|
43
50
|
def type = @type || self.class.guess_content_type_from_url(url)
|
|
44
51
|
|
|
45
|
-
|
|
52
|
+
# @return [Integer] enclosure length in bytes
|
|
53
|
+
def bytes_length = @bits_length
|
|
54
|
+
|
|
55
|
+
# @return [Integer] enclosure length in bytes (legacy reader name)
|
|
56
|
+
def bits_length = bytes_length
|
|
57
|
+
|
|
58
|
+
# @return [Html2rss::Url] absolute enclosure URL
|
|
59
|
+
attr_reader :url
|
|
46
60
|
end
|
|
47
61
|
end
|
|
48
62
|
end
|
|
@@ -35,8 +35,12 @@ module Html2rss
|
|
|
35
35
|
end
|
|
36
36
|
end
|
|
37
37
|
|
|
38
|
+
# Allowed stylesheet MIME types for RSS processing instructions.
|
|
38
39
|
TYPES = ['text/css', 'text/xsl'].to_set.freeze
|
|
39
40
|
|
|
41
|
+
# @param href [String] stylesheet URL
|
|
42
|
+
# @param type [String] MIME type (`text/css` or `text/xsl`)
|
|
43
|
+
# @param media [String] media query hint for the stylesheet
|
|
40
44
|
def initialize(href:, type:, media: 'all')
|
|
41
45
|
raise ArgumentError, 'stylesheet.href must be a String' unless href.is_a?(String)
|
|
42
46
|
raise ArgumentError, 'stylesheet.type invalid' unless TYPES.include?(type)
|
data/lib/html2rss/rss_builder.rb
CHANGED
|
@@ -7,6 +7,9 @@ module Html2rss
|
|
|
7
7
|
# Builds an RSS Feed by providing channel, articles and stylesheets.
|
|
8
8
|
class RssBuilder
|
|
9
9
|
class << self
|
|
10
|
+
# @param article [Html2rss::RssBuilder::Article] source article
|
|
11
|
+
# @param item_maker [RSS::Maker::RSS20::ItemsBase::ItemBase] RSS item builder
|
|
12
|
+
# @return [void]
|
|
10
13
|
def add_item(article, item_maker)
|
|
11
14
|
add_item_string_values(article, item_maker)
|
|
12
15
|
add_item_categories(article, item_maker)
|
|
@@ -50,6 +53,7 @@ module Html2rss
|
|
|
50
53
|
@stylesheets = stylesheets
|
|
51
54
|
end
|
|
52
55
|
|
|
56
|
+
# @return [RSS::Rss] RSS 2.0 document instance
|
|
53
57
|
def call
|
|
54
58
|
RSS::Maker.make('2.0') do |maker|
|
|
55
59
|
Stylesheet.add(maker, stylesheets)
|
|
@@ -32,6 +32,8 @@ module Html2rss
|
|
|
32
32
|
#
|
|
33
33
|
# @param xml [Nokogiri::XML::Element]
|
|
34
34
|
# @param options [Options]
|
|
35
|
+
# @option options [String] :selector CSS selector used to find the element
|
|
36
|
+
# @option options [String] :attribute attribute name to extract from the selected element
|
|
35
37
|
def initialize(xml, options)
|
|
36
38
|
@options = options
|
|
37
39
|
@element = Extractors.element(xml, options.selector)
|
|
@@ -32,6 +32,8 @@ module Html2rss
|
|
|
32
32
|
#
|
|
33
33
|
# @param xml [Nokogiri::XML::Element]
|
|
34
34
|
# @param options [Options]
|
|
35
|
+
# @option options [String] :selector CSS selector used to find the link element
|
|
36
|
+
# @option options [Hash{Symbol => Object}] :channel channel configuration, including :url
|
|
35
37
|
def initialize(xml, options)
|
|
36
38
|
@options = options
|
|
37
39
|
@element = Extractors.element(xml, options.selector)
|
|
@@ -31,6 +31,7 @@ module Html2rss
|
|
|
31
31
|
#
|
|
32
32
|
# @param xml [Nokogiri::XML::Element]
|
|
33
33
|
# @param options [Options]
|
|
34
|
+
# @option options [String] :selector CSS selector used to find the element
|
|
34
35
|
def initialize(xml, options)
|
|
35
36
|
@element = Extractors.element(xml, options.selector)
|
|
36
37
|
end
|
|
@@ -9,7 +9,7 @@ module Html2rss
|
|
|
9
9
|
# Example usage in YAML:
|
|
10
10
|
#
|
|
11
11
|
# selectors:
|
|
12
|
-
#
|
|
12
|
+
# byline:
|
|
13
13
|
# extractor: static
|
|
14
14
|
# static: Foobar
|
|
15
15
|
#
|
|
@@ -24,6 +24,7 @@ module Html2rss
|
|
|
24
24
|
#
|
|
25
25
|
# @param _xml [nil, Nokogiri::XML::Element] Unused parameter for compatibility with other extractors.
|
|
26
26
|
# @param options [Options] Options containing the static value.
|
|
27
|
+
# @option options [String, Symbol] :static static value returned by this extractor
|
|
27
28
|
def initialize(_xml, options)
|
|
28
29
|
@options = options
|
|
29
30
|
end
|
|
@@ -29,6 +29,7 @@ module Html2rss
|
|
|
29
29
|
#
|
|
30
30
|
# @param xml [Nokogiri::XML::Element]
|
|
31
31
|
# @param options [Options]
|
|
32
|
+
# @option options [String] :selector CSS selector used to find the element
|
|
32
33
|
def initialize(xml, options)
|
|
33
34
|
@element = Extractors.element(xml, options.selector)
|
|
34
35
|
end
|
|
@@ -23,6 +23,7 @@ module Html2rss
|
|
|
23
23
|
hash[klass] = klass.const_get(:Options)
|
|
24
24
|
end
|
|
25
25
|
|
|
26
|
+
# Extractor used when none is explicitly configured.
|
|
26
27
|
DEFAULT_EXTRACTOR = :text
|
|
27
28
|
|
|
28
29
|
class << self
|
|
@@ -36,7 +37,7 @@ module Html2rss
|
|
|
36
37
|
selector ? xml.css(selector) : xml
|
|
37
38
|
end
|
|
38
39
|
|
|
39
|
-
# @param attribute_options [Hash
|
|
40
|
+
# @param attribute_options [Hash{Symbol => Object}]
|
|
40
41
|
# Should contain at least `:extractor` (the name) and required options for that extractor.
|
|
41
42
|
# @param xml [Nokogiri::XML::Document]
|
|
42
43
|
# @return [Object] instance of the specified item extractor class
|
|
@@ -7,6 +7,7 @@ module Html2rss
|
|
|
7
7
|
##
|
|
8
8
|
# A naive implementation of "Object to XML": converts a Ruby object to XML format.
|
|
9
9
|
class ObjectToXmlConverter
|
|
10
|
+
# Wrapper tags used for top-level collection conversion.
|
|
10
11
|
OBJECT_TO_XML_TAGS = {
|
|
11
12
|
hash: ['<object>', '</object>'],
|
|
12
13
|
array: ['<array>', '</array>']
|
|
@@ -9,7 +9,8 @@ module Html2rss
|
|
|
9
9
|
# Validates the presence of required options in the context
|
|
10
10
|
#
|
|
11
11
|
# @param keys [Array<Symbol>] the keys to check for presence
|
|
12
|
-
# @param context [
|
|
12
|
+
# @param context [Selectors::Context] the context containing options
|
|
13
|
+
# @return [void]
|
|
13
14
|
# @raise [MissingOption] if any key is missing
|
|
14
15
|
def self.expect_options(keys, context)
|
|
15
16
|
keys.each do |key|
|
|
@@ -25,13 +26,14 @@ module Html2rss
|
|
|
25
26
|
# @param value [Object] the value to check
|
|
26
27
|
# @param types [Array<Class>, Class] the expected type(s)
|
|
27
28
|
# @param name [String] the name of the option being checked
|
|
28
|
-
# @param context [Selectors::Context]
|
|
29
|
+
# @param context [Selectors::Context] call-site context used for richer validation errors
|
|
30
|
+
# @return [void]
|
|
29
31
|
# @raise [InvalidType] if the value is not of the expected type(s)
|
|
30
32
|
def self.assert_type(value, types = [], name, context:)
|
|
31
33
|
return if Array(types).any? { |type| value.is_a?(type) }
|
|
32
34
|
|
|
33
|
-
options = if context.
|
|
34
|
-
context
|
|
35
|
+
options = if context.respond_to?(:options)
|
|
36
|
+
context.options
|
|
35
37
|
else
|
|
36
38
|
{ file: File.basename(caller(1, 1).first.split(':').first) }
|
|
37
39
|
end
|
|
@@ -42,6 +44,10 @@ module Html2rss
|
|
|
42
44
|
|
|
43
45
|
##
|
|
44
46
|
# This method validates the arguments passed to the post processor. Must be implemented by subclasses.
|
|
47
|
+
#
|
|
48
|
+
# @param _value [Object] extracted selector value
|
|
49
|
+
# @param _context [Selectors::Context] post-processor execution context
|
|
50
|
+
# @return [void]
|
|
45
51
|
def self.validate_args!(_value, _context)
|
|
46
52
|
raise NotImplementedError, 'You must implement the `validate_args!` method in the post processor'
|
|
47
53
|
end
|
|
@@ -49,11 +55,10 @@ module Html2rss
|
|
|
49
55
|
# Initializes the post processor
|
|
50
56
|
#
|
|
51
57
|
# @param value [Object] the value to be processed
|
|
52
|
-
# @param context [Selectors::Context]
|
|
58
|
+
# @param context [Selectors::Context] runtime selector context and options
|
|
53
59
|
def initialize(value, context)
|
|
54
60
|
klass = self.class
|
|
55
|
-
|
|
56
|
-
klass.assert_type(context, [Selectors::Context, Hash], 'context', context:)
|
|
61
|
+
klass.assert_type(context, Selectors::Context, 'context', context:)
|
|
57
62
|
klass.validate_args!(value, context)
|
|
58
63
|
|
|
59
64
|
@value = value
|
|
@@ -64,6 +69,7 @@ module Html2rss
|
|
|
64
69
|
|
|
65
70
|
# Abstract method to be implemented by subclasses
|
|
66
71
|
#
|
|
72
|
+
# @return [Object] transformed value
|
|
67
73
|
# @raise [NotImplementedError] if not implemented in subclass
|
|
68
74
|
def get
|
|
69
75
|
raise NotImplementedError, 'You must implement the `get` method in the post processor'
|
|
@@ -29,6 +29,9 @@ module Html2rss
|
|
|
29
29
|
#
|
|
30
30
|
# See the doc on [String#gsub](https://ruby-doc.org/core/String.html#method-i-gsub) for more info.
|
|
31
31
|
class Gsub < Base
|
|
32
|
+
# @param value [String] extracted selector value
|
|
33
|
+
# @param context [Selectors::Context] post-processor context
|
|
34
|
+
# @return [void]
|
|
32
35
|
def self.validate_args!(value, context)
|
|
33
36
|
assert_type value, String, :value, context:
|
|
34
37
|
expect_options(%i[replacement pattern], context)
|
|
@@ -28,6 +28,9 @@ module Html2rss
|
|
|
28
28
|
# Would return:
|
|
29
29
|
# 'Lorem **ipsum** dolor'
|
|
30
30
|
class HtmlToMarkdown < Base
|
|
31
|
+
# @param value [String] extracted selector value
|
|
32
|
+
# @param context [Selectors::Context] post-processor context
|
|
33
|
+
# @return [void]
|
|
31
34
|
def self.validate_args!(value, context)
|
|
32
35
|
assert_type value, String, :value, context:
|
|
33
36
|
end
|
data/lib/html2rss/selectors/post_processors/html_transformers/transform_urls_to_absolute_ones.rb
CHANGED
|
@@ -3,10 +3,12 @@
|
|
|
3
3
|
module Html2rss
|
|
4
4
|
class Selectors
|
|
5
5
|
module PostProcessors
|
|
6
|
+
# HTML tree transformers used by selectors post-processing.
|
|
6
7
|
module HtmlTransformers
|
|
7
8
|
##
|
|
8
9
|
# Transformer that converts relative URLs to absolute URLs within specified HTML elements.
|
|
9
10
|
class TransformUrlsToAbsoluteOnes
|
|
11
|
+
# HTML tags and the URL-bearing attribute that should be normalized.
|
|
10
12
|
URL_ELEMENTS_WITH_URL_ATTRIBUTE = {
|
|
11
13
|
'a' => :href, # Visible link
|
|
12
14
|
'img' => :src, # Visible image
|
|
@@ -15,12 +17,19 @@ module Html2rss
|
|
|
15
17
|
'video' => :src # Video player is visible
|
|
16
18
|
}.freeze
|
|
17
19
|
|
|
20
|
+
# @param channel_url [String, Html2rss::Url] base URL used to resolve relative links
|
|
18
21
|
def initialize(channel_url)
|
|
19
22
|
@channel_url = channel_url
|
|
20
23
|
end
|
|
21
24
|
|
|
22
25
|
##
|
|
23
26
|
# Transforms URLs to absolute ones.
|
|
27
|
+
#
|
|
28
|
+
# @param node_name [String] node name currently being transformed
|
|
29
|
+
# @param node [Nokogiri::XML::Node] node currently being transformed
|
|
30
|
+
# @param _env [Hash] transformer context
|
|
31
|
+
# @option _env [Object] :_reserved reserved for transformer pipeline context
|
|
32
|
+
# @return [void]
|
|
24
33
|
def call(node_name:, node:, **_env)
|
|
25
34
|
return unless URL_ELEMENTS_WITH_URL_ATTRIBUTE.key?(node_name)
|
|
26
35
|
|
|
@@ -12,6 +12,8 @@ module Html2rss
|
|
|
12
12
|
#
|
|
13
13
|
# @param node_name [String]
|
|
14
14
|
# @param node [Nokogiri::XML::Node]
|
|
15
|
+
# @param _env [Hash] transformer context
|
|
16
|
+
# @option _env [Object] :_reserved reserved for transformer pipeline context
|
|
15
17
|
# @return [nil]
|
|
16
18
|
def call(node_name:, node:, **_env)
|
|
17
19
|
return unless should_process?(node_name)
|
|
@@ -19,10 +21,14 @@ module Html2rss
|
|
|
19
21
|
wrap_image_in_anchor(node) unless already_wrapped?(node)
|
|
20
22
|
end
|
|
21
23
|
|
|
24
|
+
# @param node_name [String] node name currently being transformed
|
|
25
|
+
# @return [Boolean] whether this transformer should run for the node
|
|
22
26
|
def should_process?(node_name)
|
|
23
27
|
node_name == 'img'
|
|
24
28
|
end
|
|
25
29
|
|
|
30
|
+
# @param node [Nokogiri::XML::Node] node currently being transformed
|
|
31
|
+
# @return [Boolean] whether the image is already wrapped in a link
|
|
26
32
|
def already_wrapped?(node)
|
|
27
33
|
node.parent.name == 'a'
|
|
28
34
|
end
|
|
@@ -34,6 +34,9 @@ module Html2rss
|
|
|
34
34
|
#
|
|
35
35
|
# <p>Price: 12.34</p>
|
|
36
36
|
class MarkdownToHtml < Base
|
|
37
|
+
# @param value [String] extracted selector value
|
|
38
|
+
# @param context [Selectors::Context] post-processor context
|
|
39
|
+
# @return [void]
|
|
37
40
|
def self.validate_args!(value, context)
|
|
38
41
|
assert_type value, String, :value, context:
|
|
39
42
|
end
|
|
@@ -27,6 +27,9 @@ module Html2rss
|
|
|
27
27
|
#
|
|
28
28
|
# It uses `Time.parse`.
|
|
29
29
|
class ParseTime < Base
|
|
30
|
+
# @param value [String] extracted selector value
|
|
31
|
+
# @param context [Selectors::Context] post-processor context
|
|
32
|
+
# @return [void]
|
|
30
33
|
def self.validate_args!(value, context)
|
|
31
34
|
assert_type(value, String, :value, context:)
|
|
32
35
|
time_zone_value = time_zone(context)
|
|
@@ -38,6 +41,8 @@ module Html2rss
|
|
|
38
41
|
assert_type(time_zone_value, String, :time_zone, context:)
|
|
39
42
|
end
|
|
40
43
|
|
|
44
|
+
# @param context [Selectors::Context] post-processor context
|
|
45
|
+
# @return [String, nil] configured channel time zone
|
|
41
46
|
def self.time_zone(context) = context.dig(:config, :channel, :time_zone)
|
|
42
47
|
|
|
43
48
|
##
|
|
@@ -23,6 +23,9 @@ module Html2rss
|
|
|
23
23
|
# Would return:
|
|
24
24
|
# 'http://why-not-use-a-link.uh'
|
|
25
25
|
class ParseUri < Base
|
|
26
|
+
# @param value [String] extracted selector value
|
|
27
|
+
# @param _context [Selectors::Context] post-processor context
|
|
28
|
+
# @return [void]
|
|
26
29
|
def self.validate_args!(value, _context)
|
|
27
30
|
raise ArgumentError, 'The `value` option is missing or empty.' if value.to_s.empty?
|
|
28
31
|
end
|
|
@@ -83,6 +83,9 @@ module Html2rss
|
|
|
83
83
|
'preload' => 'none'
|
|
84
84
|
}
|
|
85
85
|
}.freeze
|
|
86
|
+
# @param value [String] extracted selector value
|
|
87
|
+
# @param context [Selectors::Context] post-processor context
|
|
88
|
+
# @return [void]
|
|
86
89
|
def self.validate_args!(value, context)
|
|
87
90
|
assert_type value, String, :value, context:
|
|
88
91
|
end
|
|
@@ -95,7 +98,8 @@ module Html2rss
|
|
|
95
98
|
def self.get(html, url)
|
|
96
99
|
return nil if String(html).empty?
|
|
97
100
|
|
|
98
|
-
new(
|
|
101
|
+
context = Selectors::Context.new(config: { channel: { url: } }, options: {})
|
|
102
|
+
new(html, context).get
|
|
99
103
|
end
|
|
100
104
|
|
|
101
105
|
##
|
|
@@ -30,6 +30,9 @@ module Html2rss
|
|
|
30
30
|
# Would return:
|
|
31
31
|
# 'bar'
|
|
32
32
|
class Substring < Base
|
|
33
|
+
# @param value [String] extracted selector value
|
|
34
|
+
# @param context [Selectors::Context] post-processor context
|
|
35
|
+
# @return [void]
|
|
33
36
|
def self.validate_args!(value, context)
|
|
34
37
|
assert_type value, String, :value, context:
|
|
35
38
|
|
|
@@ -34,6 +34,9 @@ module Html2rss
|
|
|
34
34
|
# Would return:
|
|
35
35
|
# 'Product (23,42€)'
|
|
36
36
|
class Template < Base
|
|
37
|
+
# @param value [String] extracted selector value
|
|
38
|
+
# @param context [Selectors::Context] post-processor context
|
|
39
|
+
# @return [void]
|
|
37
40
|
def self.validate_args!(value, context)
|
|
38
41
|
assert_type value, String, :value, context:
|
|
39
42
|
|
|
@@ -34,6 +34,11 @@ module Html2rss
|
|
|
34
34
|
|
|
35
35
|
##
|
|
36
36
|
# Shorthand method to instantiate the post processor and call `#get` on it
|
|
37
|
+
#
|
|
38
|
+
# @param name [String, Symbol] post-processor name from selector config
|
|
39
|
+
# @param value [Object] extracted selector value
|
|
40
|
+
# @param context [Selectors::Context] post-processor context
|
|
41
|
+
# @return [Object] transformed selector value
|
|
37
42
|
def self.get(name, value, context)
|
|
38
43
|
klass = NAME_TO_CLASS[name.to_sym] || raise(UnknownPostProcessorName, "Unknown name '#{name}'")
|
|
39
44
|
klass.new(value, context).get
|
data/lib/html2rss/selectors.rb
CHANGED
|
@@ -20,10 +20,14 @@ module Html2rss
|
|
|
20
20
|
# A context instance passed to item extractors and post-processors.
|
|
21
21
|
Context = Struct.new('Context', :options, :item, :config, :scraper, keyword_init: true) # rubocop:disable Style/RedundantStructKeywordInit
|
|
22
22
|
|
|
23
|
+
# Default selectors options merged into user configuration.
|
|
23
24
|
DEFAULT_CONFIG = { items: { enhance: true } }.freeze
|
|
24
25
|
|
|
26
|
+
# Selector key that points to the root list of article nodes.
|
|
25
27
|
ITEMS_SELECTOR_KEY = :items
|
|
28
|
+
# Supported RSS item attributes extractable through selectors.
|
|
26
29
|
ITEM_TAGS = %i[title url description author comments published_at guid enclosure categories].freeze
|
|
30
|
+
# Item attributes that require dedicated extraction logic.
|
|
27
31
|
SPECIAL_ATTRIBUTES = Set[:guid, :enclosure, :categories].freeze
|
|
28
32
|
|
|
29
33
|
# Mapping of new attribute names to their legacy names for backward compatibility.
|
|
@@ -85,6 +89,7 @@ module Html2rss
|
|
|
85
89
|
# Extracts an article hash for a given item element.
|
|
86
90
|
#
|
|
87
91
|
# @param item [Nokogiri::XML::Element] The element to extract from.
|
|
92
|
+
# @param page_response [RequestService::Response] response used for selector extraction context
|
|
88
93
|
# @return [Hash] Hash of attributes for the article.
|
|
89
94
|
def extract_article(item, page_response = response)
|
|
90
95
|
@rss_item_attributes.to_h { |key| [key, select(key, item, base_url: page_response.url)] }.compact
|
|
@@ -96,6 +101,7 @@ module Html2rss
|
|
|
96
101
|
#
|
|
97
102
|
# @param article_hash [Hash] The original article hash.
|
|
98
103
|
# @param article_tag [Nokogiri::XML::Element] HTML element to extract additional info from.
|
|
104
|
+
# @param base_url [String, Html2rss::Url] base URL for normalization during enhancement
|
|
99
105
|
# @return [Hash] The enhanced article hash.
|
|
100
106
|
def enhance_article_hash(article_hash, article_tag, base_url = @url)
|
|
101
107
|
selected_anchor = HtmlExtractor.main_anchor_for(article_tag)
|
|
@@ -116,6 +122,7 @@ module Html2rss
|
|
|
116
122
|
#
|
|
117
123
|
# @param name [Symbol, String] Name of the attribute.
|
|
118
124
|
# @param item [Nokogiri::XML::Element] The HTML element to process.
|
|
125
|
+
# @param base_url [String, Html2rss::Url] base URL for relative extraction values
|
|
119
126
|
# @return [Object, Array<Object>] The selected value(s).
|
|
120
127
|
# @raise [InvalidSelectorName] If the attribute name is invalid or not defined.
|
|
121
128
|
def select(name, item, base_url: @url)
|
data/lib/html2rss/url.rb
CHANGED
|
@@ -25,6 +25,7 @@ module Html2rss
|
|
|
25
25
|
|
|
26
26
|
# Regular expression for basic URI format validation
|
|
27
27
|
URI_REGEXP = Addressable::URI::URIREGEX
|
|
28
|
+
# Schemes accepted by channel URL validation.
|
|
28
29
|
SUPPORTED_SCHEMES = %w[http https].to_set.freeze
|
|
29
30
|
|
|
30
31
|
##
|
|
@@ -107,7 +108,10 @@ module Html2rss
|
|
|
107
108
|
def self.validate_channel_url(url)
|
|
108
109
|
raise ArgumentError, 'URL must be absolute' unless url.absolute?
|
|
109
110
|
|
|
110
|
-
|
|
111
|
+
uri = Addressable::URI.parse(url.to_s)
|
|
112
|
+
has_forbidden_at = uri.user || uri.password
|
|
113
|
+
has_forbidden_at ||= [uri.query, uri.fragment].compact.any? { |value| value.include?('@') }
|
|
114
|
+
raise ArgumentError, 'URL must not contain an @ character' if has_forbidden_at
|
|
111
115
|
|
|
112
116
|
scheme = url.scheme
|
|
113
117
|
raise ArgumentError, "URL scheme '#{scheme}' is not supported" unless SUPPORTED_SCHEMES.include?(scheme)
|
|
@@ -122,31 +126,41 @@ module Html2rss
|
|
|
122
126
|
freeze
|
|
123
127
|
end
|
|
124
128
|
|
|
125
|
-
#
|
|
129
|
+
# @return [String] normalized URL string
|
|
126
130
|
def to_s = @uri.to_s
|
|
131
|
+
|
|
132
|
+
# @return [String, nil] URI scheme, for example `http` or `https`
|
|
127
133
|
def scheme = @uri.scheme
|
|
134
|
+
|
|
135
|
+
# @return [String, nil] URI host component
|
|
128
136
|
def host = @uri.host
|
|
137
|
+
|
|
138
|
+
# @return [Integer, nil] URI port component
|
|
129
139
|
def port = @uri.port
|
|
140
|
+
|
|
141
|
+
# @return [String, nil] URI path component
|
|
130
142
|
def path = @uri.path
|
|
143
|
+
|
|
144
|
+
# @return [String, nil] URI query string without leading `?`
|
|
131
145
|
def query = @uri.query
|
|
146
|
+
|
|
147
|
+
# @return [String, nil] URI fragment without leading `#`
|
|
132
148
|
def fragment = @uri.fragment
|
|
149
|
+
|
|
150
|
+
# @return [Boolean] whether the URL includes scheme and host
|
|
133
151
|
def absolute? = @uri.absolute?
|
|
134
152
|
|
|
135
153
|
##
|
|
136
154
|
# Returns the URL query string as a hash of string keys and values.
|
|
137
155
|
#
|
|
138
156
|
# @return [Hash{String => String}] normalized query parameters
|
|
139
|
-
def query_values
|
|
140
|
-
@uri.query_values(Hash) || {}
|
|
141
|
-
end
|
|
157
|
+
def query_values = @uri.query_values(Hash) || {}
|
|
142
158
|
|
|
143
159
|
##
|
|
144
160
|
# Returns the URL path split into non-empty segments.
|
|
145
161
|
#
|
|
146
162
|
# @return [Array<String>] normalized path segments
|
|
147
|
-
def path_segments
|
|
148
|
-
@uri.path.to_s.split('/').reject(&:empty?)
|
|
149
|
-
end
|
|
163
|
+
def path_segments = @uri.path.to_s.split('/').reject(&:empty?)
|
|
150
164
|
|
|
151
165
|
##
|
|
152
166
|
# Returns a copy of the URL with the provided path.
|
|
@@ -221,42 +235,32 @@ module Html2rss
|
|
|
221
235
|
#
|
|
222
236
|
# @param other [Url] the other URL to compare with
|
|
223
237
|
# @return [Integer] -1, 0, or 1 for less than, equal, or greater than
|
|
224
|
-
def <=>(other)
|
|
225
|
-
to_s <=> other.to_s
|
|
226
|
-
end
|
|
238
|
+
def <=>(other) = to_s <=> other.to_s
|
|
227
239
|
|
|
228
240
|
##
|
|
229
241
|
# Returns true if this URL is equal to another URL.
|
|
230
242
|
#
|
|
231
243
|
# @param other [Object] the other object to compare with
|
|
232
244
|
# @return [Boolean] true if the URLs are equal
|
|
233
|
-
def ==(other)
|
|
234
|
-
other.is_a?(Url) && to_s == other.to_s
|
|
235
|
-
end
|
|
245
|
+
def ==(other) = other.is_a?(Url) && to_s == other.to_s
|
|
236
246
|
|
|
237
247
|
##
|
|
238
248
|
# Supports hash-based comparisons by ensuring equality semantics match `hash`.
|
|
239
249
|
#
|
|
240
250
|
# @param other [Object] the other object to compare with
|
|
241
251
|
# @return [Boolean] true if the URLs are considered equal
|
|
242
|
-
def eql?(other)
|
|
243
|
-
other.is_a?(Url) && to_s == other.to_s
|
|
244
|
-
end
|
|
252
|
+
def eql?(other) = other.is_a?(Url) && to_s == other.to_s
|
|
245
253
|
|
|
246
254
|
##
|
|
247
255
|
# Returns the hash code for this URL.
|
|
248
256
|
#
|
|
249
257
|
# @return [Integer] the hash code
|
|
250
|
-
def hash
|
|
251
|
-
to_s.hash
|
|
252
|
-
end
|
|
258
|
+
def hash = to_s.hash
|
|
253
259
|
|
|
254
260
|
##
|
|
255
261
|
# Returns a string representation of the URL for debugging.
|
|
256
262
|
#
|
|
257
263
|
# @return [String] the debug representation
|
|
258
|
-
def inspect
|
|
259
|
-
"#<#{self.class}:#{object_id} @uri=#{@uri.inspect}>"
|
|
260
|
-
end
|
|
264
|
+
def inspect = "#<#{self.class}:#{object_id} @uri=#{@uri.inspect}>"
|
|
261
265
|
end
|
|
262
266
|
end
|