html2rss 0.18.0 → 0.19.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +60 -1
- data/lib/html2rss/articles/deduplicator.rb +1 -0
- data/lib/html2rss/auto_source/cleanup.rb +11 -0
- data/lib/html2rss/auto_source/scraper/html.rb +5 -0
- data/lib/html2rss/auto_source/scraper/json_state.rb +96 -16
- data/lib/html2rss/auto_source/scraper/microdata.rb +107 -1
- data/lib/html2rss/auto_source/scraper/schema/category_extractor.rb +1 -1
- data/lib/html2rss/auto_source/scraper/schema/item_list.rb +1 -0
- data/lib/html2rss/auto_source/scraper/schema/list_item.rb +3 -1
- data/lib/html2rss/auto_source/scraper/schema/thing.rb +21 -0
- data/lib/html2rss/auto_source/scraper/schema.rb +15 -4
- data/lib/html2rss/auto_source/scraper/semantic_html/anchor_selector.rb +5 -0
- data/lib/html2rss/auto_source/scraper/semantic_html.rb +4 -0
- data/lib/html2rss/auto_source/scraper/wordpress_api/page_scope.rb +60 -10
- data/lib/html2rss/auto_source/scraper/wordpress_api/posts_endpoint.rb +3 -2
- data/lib/html2rss/auto_source/scraper/wordpress_api.rb +19 -12
- data/lib/html2rss/auto_source/scraper.rb +19 -1
- data/lib/html2rss/auto_source.rb +4 -0
- data/lib/html2rss/blocked_surface.rb +1 -0
- data/lib/html2rss/category_extractor.rb +2 -2
- data/lib/html2rss/cli.rb +30 -6
- data/lib/html2rss/config/class_methods.rb +24 -35
- data/lib/html2rss/config/dynamic_params.rb +6 -4
- data/lib/html2rss/config/multiple_feeds_config.rb +3 -2
- data/lib/html2rss/config/request_headers.rb +9 -3
- data/lib/html2rss/config/schema.rb +33 -1
- data/lib/html2rss/config/validator.rb +40 -2
- data/lib/html2rss/config.rb +19 -13
- data/lib/html2rss/error.rb +25 -0
- data/lib/html2rss/feed_pipeline/auto_fallback.rb +127 -0
- data/lib/html2rss/feed_pipeline.rb +127 -0
- data/lib/html2rss/hash_util.rb +101 -0
- data/lib/html2rss/html_extractor/date_extractor.rb +1 -0
- data/lib/html2rss/html_extractor/enclosure_extractor.rb +19 -0
- data/lib/html2rss/html_extractor/image_extractor.rb +9 -0
- data/lib/html2rss/html_extractor.rb +5 -0
- data/lib/html2rss/html_navigator.rb +8 -0
- data/lib/html2rss/json_feed_builder.rb +1 -0
- data/lib/html2rss/rendering/audio_renderer.rb +8 -3
- data/lib/html2rss/rendering/description_builder.rb +0 -1
- data/lib/html2rss/rendering/image_renderer.rb +17 -7
- data/lib/html2rss/rendering/media_renderer.rb +4 -0
- data/lib/html2rss/rendering/pdf_renderer.rb +11 -5
- data/lib/html2rss/rendering/video_renderer.rb +8 -3
- data/lib/html2rss/rendering.rb +11 -2
- data/lib/html2rss/request_controls.rb +16 -21
- data/lib/html2rss/request_service/botasaurus_contract.rb +161 -0
- data/lib/html2rss/request_service/botasaurus_strategy.rb +98 -0
- data/lib/html2rss/request_service/context.rb +14 -2
- data/lib/html2rss/request_service/faraday_strategy.rb +6 -4
- data/lib/html2rss/request_service/policy.rb +4 -0
- data/lib/html2rss/request_service/response.rb +9 -1
- data/lib/html2rss/request_service.rb +19 -0
- data/lib/html2rss/request_session/runtime_input.rb +16 -2
- data/lib/html2rss/request_session/runtime_policy.rb +7 -0
- data/lib/html2rss/request_session.rb +13 -9
- data/lib/html2rss/rss_builder/article.rb +22 -1
- data/lib/html2rss/rss_builder/channel.rb +11 -2
- data/lib/html2rss/rss_builder/enclosure.rb +15 -1
- data/lib/html2rss/rss_builder/stylesheet.rb +4 -0
- data/lib/html2rss/rss_builder.rb +4 -0
- data/lib/html2rss/selectors/config.rb +1 -0
- data/lib/html2rss/selectors/extractors/attribute.rb +2 -0
- data/lib/html2rss/selectors/extractors/href.rb +2 -0
- data/lib/html2rss/selectors/extractors/html.rb +1 -0
- data/lib/html2rss/selectors/extractors/static.rb +2 -1
- data/lib/html2rss/selectors/extractors/text.rb +1 -0
- data/lib/html2rss/selectors/extractors.rb +2 -1
- data/lib/html2rss/selectors/object_to_xml_converter.rb +1 -0
- data/lib/html2rss/selectors/post_processors/base.rb +13 -7
- data/lib/html2rss/selectors/post_processors/gsub.rb +3 -0
- data/lib/html2rss/selectors/post_processors/html_to_markdown.rb +3 -0
- data/lib/html2rss/selectors/post_processors/html_transformers/transform_urls_to_absolute_ones.rb +9 -0
- data/lib/html2rss/selectors/post_processors/html_transformers/wrap_img_in_a.rb +6 -0
- data/lib/html2rss/selectors/post_processors/markdown_to_html.rb +3 -0
- data/lib/html2rss/selectors/post_processors/parse_time.rb +5 -0
- data/lib/html2rss/selectors/post_processors/parse_uri.rb +3 -0
- data/lib/html2rss/selectors/post_processors/sanitize_html.rb +5 -1
- data/lib/html2rss/selectors/post_processors/substring.rb +3 -0
- data/lib/html2rss/selectors/post_processors/template.rb +3 -0
- data/lib/html2rss/selectors/post_processors.rb +5 -0
- data/lib/html2rss/selectors.rb +7 -0
- data/lib/html2rss/url.rb +27 -23
- data/lib/html2rss/version.rb +2 -1
- data/lib/html2rss.rb +15 -78
- data/schema/html2rss-config.schema.json +83 -1
- metadata +7 -2
|
@@ -0,0 +1,127 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Html2rss
  ##
  # Turns a raw feed config into a finished feed.
  #
  # The config is validated first, then a request session fetches the initial
  # response, articles are collected (selectors and/or auto source) and
  # deduplicated, and finally the requested output format is built.
  class FeedPipeline
    ##
    # @param raw_config [Hash{Symbol => Object}] user-provided feed config
    def initialize(raw_config)
      @raw_config = raw_config
    end

    ##
    # Runs the pipeline and renders the result as RSS.
    #
    # @return [RSS::Rss] generated RSS feed
    def to_rss
      run do |response:, config:, articles:|
        channel = channel_for(response, config)
        RssBuilder.new(channel:, articles:, stylesheets: config.stylesheets).call
      end
    end

    ##
    # Runs the pipeline and renders the result as JSONFeed.
    #
    # @return [Hash] generated JSONFeed 1.1 payload
    def to_json_feed
      run do |response:, config:, articles:|
        JsonFeedBuilder.new(channel: channel_for(response, config), articles:).call
      end
    end

    private

    attr_reader :raw_config

    # Builds the channel from the response, with the config's channel values as overrides.
    def channel_for(response, config)
      RssBuilder::Channel.new(response, overrides: config.channel)
    end

    # Builds the config, executes the pipeline, and yields its results to the
    # format-specific block as keyword arguments.
    def run
      config = Config.from_hash(raw_config, params: raw_config[:params])
      result = pipeline_state_for(config)

      yield response: result.fetch(:response), config:, articles: result.fetch(:articles)
    end

    # The :auto strategy is delegated to the fallback chain; any other strategy runs directly.
    def pipeline_state_for(config)
      return run_auto_pipeline(config) if config.strategy == :auto

      run_pipeline_for_strategy(config, strategy: config.strategy)
    end

    # @return [Hash{Symbol => Object}] hash with the :response and :articles keys
    def run_pipeline_for_strategy(config, strategy:, budget: nil)
      session = request_session_for(config, strategy:, budget:)
      initial_response = session.fetch_initial_response

      {
        response: initial_response,
        articles: deduplicated_articles(response: initial_response, config:, request_session: session)
      }
    end

    def request_session_for(config, strategy:, budget: nil)
      runtime_input = runtime_input_for(config, strategy:)
      RequestSession.from_runtime_input(runtime_input, budget:)
    end

    def runtime_input_for(config, strategy:)
      RequestSession::RuntimeInput.new(
        url: config.url,
        headers: config.headers,
        request: config.request,
        strategy:,
        request_policy: RequestSession::RuntimePolicy.from_config(config)
      )
    end

    def deduplicated_articles(response:, config:, request_session:)
      collected = collect_articles(response:, config:, request_session:)
      Articles::Deduplicator.new(collected).call
    end

    def run_auto_pipeline(config)
      auto_fallback_for(config).call
    end

    # Wires the fallback chain with callbacks that create a session per
    # strategy and collect articles for a fetched response.
    def auto_fallback_for(config)
      session_for = ->(strategy:, budget:) { request_session_for(config, strategy:, budget:) }
      articles_for = lambda { |response:, request_session:|
        deduplicated_articles(response:, config:, request_session:)
      }

      AutoFallback.new(
        strategies: AutoFallback::CHAIN,
        budget: auto_pipeline_budget(config),
        session_for:,
        articles_for:
      )
    end

    # Budget for the auto pipeline, sized by the runtime policy's max_requests.
    def auto_pipeline_budget(config)
      policy = RequestSession::RuntimePolicy.from_config(config)
      RequestService::Budget.new(max_requests: policy.max_requests)
    end

    # Selector-based articles come first, followed by auto-source articles.
    def collect_articles(response:, config:, request_session:)
      from_selectors = selector_articles(response:, config:, request_session:)
      from_auto_source = auto_source_articles(response:, config:, request_session:)

      from_selectors + from_auto_source
    end

    def selector_articles(response:, config:, request_session:)
      selectors = config.selectors
      return [] unless selectors

      page_responses_for(response:, selectors:, request_session:).flat_map do |page_response|
        Selectors.new(page_response, selectors:, time_zone: config.time_zone).articles
      end
    end

    # Responses to extract from: only the initial response, unless the items
    # selector configures pagination with max_pages, in which case the pager
    # supplies the responses.
    def page_responses_for(response:, selectors:, request_session:)
      max_pages = selectors.dig(:items, :pagination, :max_pages)
      return [response] unless max_pages

      RequestSession::RelNextPager.new(
        session: request_session,
        initial_response: response,
        max_pages:
      ).to_a
    end

    def auto_source_articles(response:, config:, request_session:)
      auto_source = config.auto_source
      return [] unless auto_source

      AutoSource.new(response, auto_source, request_session:).articles
    end
  end
end
|
|
@@ -0,0 +1,101 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Html2rss
  # Shared helpers for hash normalization and structural operations.
  #
  # Every helper is a module function, e.g. `HashUtil.deep_dup(value)`.
  module HashUtil
    module_function

    # Deeply duplicates nested arrays and hashes.
    #
    # Hash values and array elements are duplicated recursively; hash keys are
    # reused as-is. Any other value is duplicated with `#dup`. A value whose
    # `#dup` raises is returned unchanged, so one uncopyable leaf never
    # replaces data or aborts the whole copy.
    #
    # @param object [Object] nested value from configuration or runtime state
    # @return [Object] deep duplicated object
    def deep_dup(object)
      case object
      in Hash
        object.transform_values { deep_dup(_1) }
      in Array
        object.map { deep_dup(_1) }
      else
        dup_or_self(object)
      end
    end

    # Deeply merges nested hashes while replacing non-hash values from override.
    #
    # When both sides hold a Hash under the same key the two are merged
    # recursively; in every other conflict the override value wins.
    #
    # @param base [Hash] base hash
    # @param override [Hash] override hash
    # @return [Hash] merged hash
    def deep_merge(base, override)
      base.merge(override) do |_key, old_val, new_val|
        case [old_val, new_val]
        in [Hash, Hash]
          deep_merge(old_val, new_val)
        else
          new_val
        end
      end
    end

    # Converts string-keyed hashes to symbol-keyed hashes recursively.
    #
    # Hashes nested inside arrays are converted as well; non-container values
    # are returned untouched.
    #
    # @param object [Object] value to normalize
    # @param context [String] error context
    # @return [Object] normalized value
    # @raise [ArgumentError] if a hash key is neither a String nor a Symbol
    def deep_symbolize_keys(object, context: 'hash')
      case object
      in Hash
        object.each_with_object({}) do |(k, v), memo|
          memo[symbol_key(k, context:)] = deep_symbolize_keys(v, context:)
        end
      in Array
        object.map { deep_symbolize_keys(_1, context:) }
      else
        object
      end
    end

    # Validates that hash keys are symbols.
    #
    # Non-hash values are accepted silently. With `deep`, hash values are
    # validated recursively (arrays are not descended into).
    #
    # @param value [Object] candidate hash container whose keys must be symbols
    # @param context [String] error context
    # @param deep [Boolean] whether nested hashes should also be validated
    # @return [void]
    # @raise [ArgumentError] if a non-symbol key is found
    def assert_symbol_keys!(value, context: 'hash', deep: true)
      return unless value in Hash

      unless value.each_key.all?(Symbol)
        invalid_key = value.keys.find { _1.class != Symbol }
        raise ArgumentError, "#{context} must use symbol keys (found #{invalid_key.inspect})"
      end

      value.each_value { assert_symbol_keys!(_1, context:, deep:) } if deep
    end

    # Validates that hash keys are strings.
    #
    # Non-hash values are accepted silently. With `deep`, hash values are
    # validated recursively (arrays are not descended into).
    #
    # @param value [Object] candidate hash container whose keys must be strings
    # @param context [String] error context
    # @param deep [Boolean] whether nested hashes should also be validated
    # @return [void]
    # @raise [ArgumentError] if a non-string key is found
    def assert_string_keys!(value, context: 'hash', deep: true)
      return unless value in Hash

      unless value.each_key.all?(String)
        invalid_key = value.keys.find { _1.class != String }
        raise ArgumentError, "#{context} must use string keys (found #{invalid_key.inspect})"
      end

      value.each_value { assert_string_keys!(_1, context:, deep:) } if deep
    end

    # Normalizes a single hash key to a Symbol.
    #
    # @param key [Object] hash key to normalize
    # @param context [String] error context
    # @return [Symbol] the key itself, or the symbolized String
    # @raise [ArgumentError] if the key is neither a String nor a Symbol
    def symbol_key(key, context:)
      case key
      in Symbol then key
      in String then key.to_sym
      else
        raise ArgumentError, "#{context} must use string or symbol keys (found #{key.inspect})"
      end
    end

    # Duplicates a leaf value, falling back to the value itself when `#dup` fails.
    #
    # A rescue modifier must not be used here with an exception class on its
    # right-hand side: that operand is the fallback *value*, so a failing
    # `#dup` would yield the class instead of the object.
    #
    # @param object [Object] leaf value
    # @return [Object] duplicate of the value, or the value itself
    def dup_or_self(object)
      object.dup
    rescue StandardError
      object
    end

    private_class_method :symbol_key, :dup_or_self
  end
end
|
|
@@ -4,6 +4,7 @@ module Html2rss
|
|
|
4
4
|
class HtmlExtractor
|
|
5
5
|
# Extracts the earliest date from an article_tag.
|
|
6
6
|
class DateExtractor
|
|
7
|
+
# @param article_tag [Nokogiri::XML::Element] article container node
|
|
7
8
|
# @return [DateTime, nil]
|
|
8
9
|
def self.call(article_tag)
|
|
9
10
|
times = article_tag.css('[datetime]').filter_map do |tag|
|
|
@@ -5,6 +5,9 @@ module Html2rss
|
|
|
5
5
|
##
|
|
6
6
|
# Extracts enclosures from HTML tags using various strategies.
|
|
7
7
|
class EnclosureExtractor
|
|
8
|
+
# @param article_tag [Nokogiri::XML::Element] article container node
|
|
9
|
+
# @param base_url [String, Html2rss::Url] base URL for relative enclosure links
|
|
10
|
+
# @return [Array<Hash{Symbol => Object}>] normalized enclosure hashes
|
|
8
11
|
def self.call(article_tag, base_url)
|
|
9
12
|
[
|
|
10
13
|
Extractors::Image,
|
|
@@ -16,10 +19,14 @@ module Html2rss
|
|
|
16
19
|
end
|
|
17
20
|
end
|
|
18
21
|
|
|
22
|
+
# Extraction strategies for enclosure-like media/link tags.
|
|
19
23
|
module Extractors
|
|
20
24
|
# Extracts image enclosures from HTML tags.
|
|
21
25
|
# Finds all image sources and returns them in a format suitable for RSS.
|
|
22
26
|
class Image
|
|
27
|
+
# @param article_tag [Nokogiri::XML::Element] article container node
|
|
28
|
+
# @param base_url [String, Html2rss::Url] base URL for relative image sources
|
|
29
|
+
# @return [Array<Hash{Symbol => Object}>] image enclosure hashes
|
|
23
30
|
def self.call(article_tag, base_url:)
|
|
24
31
|
article_tag.css('img[src]:not([src^="data"])').filter_map do |img|
|
|
25
32
|
src = img['src'].to_s
|
|
@@ -36,6 +43,9 @@ module Html2rss
|
|
|
36
43
|
|
|
37
44
|
# Extracts media enclosures (video/audio) from HTML tags.
|
|
38
45
|
class Media
|
|
46
|
+
# @param article_tag [Nokogiri::XML::Element] article container node
|
|
47
|
+
# @param base_url [String, Html2rss::Url] base URL for relative media sources
|
|
48
|
+
# @return [Array<Hash{Symbol => Object}>] media enclosure hashes
|
|
39
49
|
def self.call(article_tag, base_url:)
|
|
40
50
|
article_tag.css('video source[src], audio source[src], audio[src]').filter_map do |element|
|
|
41
51
|
src = element['src'].to_s
|
|
@@ -51,6 +61,9 @@ module Html2rss
|
|
|
51
61
|
|
|
52
62
|
# Extracts PDF enclosures from HTML tags.
|
|
53
63
|
class Pdf
|
|
64
|
+
# @param article_tag [Nokogiri::XML::Element] article container node
|
|
65
|
+
# @param base_url [String, Html2rss::Url] base URL for relative PDF links
|
|
66
|
+
# @return [Array<Hash{Symbol => Object}>] PDF enclosure hashes
|
|
54
67
|
def self.call(article_tag, base_url:)
|
|
55
68
|
article_tag.css('a[href$=".pdf"]').filter_map do |link|
|
|
56
69
|
href = link['href'].to_s
|
|
@@ -67,6 +80,9 @@ module Html2rss
|
|
|
67
80
|
|
|
68
81
|
# Extracts iframe enclosures from HTML tags.
|
|
69
82
|
class Iframe
|
|
83
|
+
# @param article_tag [Nokogiri::XML::Element] article container node
|
|
84
|
+
# @param base_url [String, Html2rss::Url] base URL for relative iframe links
|
|
85
|
+
# @return [Array<Hash{Symbol => Object}>] iframe enclosure hashes
|
|
70
86
|
def self.call(article_tag, base_url:)
|
|
71
87
|
article_tag.css('iframe[src]').filter_map do |iframe|
|
|
72
88
|
src = iframe['src']
|
|
@@ -83,6 +99,9 @@ module Html2rss
|
|
|
83
99
|
|
|
84
100
|
# Extracts archive enclosures (zip, tar.gz, tgz) from HTML tags.
|
|
85
101
|
class Archive
|
|
102
|
+
# @param article_tag [Nokogiri::XML::Element] article container node
|
|
103
|
+
# @param base_url [String, Html2rss::Url] base URL for relative archive links
|
|
104
|
+
# @return [Array<Hash{Symbol => Object}>] archive enclosure hashes
|
|
86
105
|
def self.call(article_tag, base_url:)
|
|
87
106
|
article_tag.css('a[href$=".zip"], a[href$=".tar.gz"], a[href$=".tgz"]').filter_map do |link|
|
|
88
107
|
href = link['href'].to_s
|
|
@@ -5,6 +5,9 @@ module Html2rss
|
|
|
5
5
|
##
|
|
6
6
|
# Image is responsible for extracting image URLs from the article_tag.
|
|
7
7
|
class ImageExtractor
|
|
8
|
+
# @param article_tag [Nokogiri::XML::Element] article container node
|
|
9
|
+
# @param base_url [String, Html2rss::Url] base URL for relative image URLs
|
|
10
|
+
# @return [Html2rss::Url, nil] best candidate image URL
|
|
8
11
|
def self.call(article_tag, base_url:)
|
|
9
12
|
img_src = from_source(article_tag) ||
|
|
10
13
|
from_img(article_tag) ||
|
|
@@ -13,6 +16,8 @@ module Html2rss
|
|
|
13
16
|
Url.from_relative(img_src, base_url) if img_src
|
|
14
17
|
end
|
|
15
18
|
|
|
19
|
+
# @param article_tag [Nokogiri::XML::Element] article container node
|
|
20
|
+
# @return [String, nil] src attribute from first matching image tag
|
|
16
21
|
def self.from_img(article_tag)
|
|
17
22
|
article_tag.at_css('img[src]:not([src^="data"])')&.[]('src')
|
|
18
23
|
end
|
|
@@ -21,6 +26,8 @@ module Html2rss
|
|
|
21
26
|
# Extracts the largest image source from the srcset attribute
|
|
22
27
|
# of an img tag or a source tag inside a picture tag.
|
|
23
28
|
#
|
|
29
|
+
# @param article_tag [Nokogiri::XML::Element] article container node
|
|
30
|
+
# @return [String, nil] largest srcset URL candidate
|
|
24
31
|
# @see <https://developer.mozilla.org/en-US/docs/Learn/HTML/Multimedia_and_embedding/Responsive_images>
|
|
25
32
|
# @see <https://developer.mozilla.org/en-US/docs/Web/HTML/Element/img#srcset>
|
|
26
33
|
# @see <https://developer.mozilla.org/en-US/docs/Web/HTML/Element/picture>
|
|
@@ -38,6 +45,8 @@ module Html2rss
|
|
|
38
45
|
hash[hash.keys.max]
|
|
39
46
|
end
|
|
40
47
|
|
|
48
|
+
# @param article_tag [Nokogiri::XML::Element] article container node
|
|
49
|
+
# @return [String, nil] best style-based background image URL
|
|
41
50
|
def self.from_style(article_tag)
|
|
42
51
|
article_tag.css('[style*="url"]')
|
|
43
52
|
.filter_map { |tag| tag['style'][/url\(['"]?(.*?)['"]?\)/, 1] }
|
|
@@ -5,10 +5,14 @@ module Html2rss
|
|
|
5
5
|
# HtmlExtractor is responsible for extracting details (headline, url, images, etc.)
|
|
6
6
|
# from an article_tag.
|
|
7
7
|
class HtmlExtractor
|
|
8
|
+
# Tags ignored when extracting visible text content from article containers.
|
|
8
9
|
INVISIBLE_CONTENT_TAGS = %w[svg script noscript style template].to_set.freeze
|
|
10
|
+
# Heading tags used to prioritize title extraction.
|
|
9
11
|
HEADING_TAGS = %w[h1 h2 h3 h4 h5 h6].freeze
|
|
12
|
+
# Selector used to derive non-headline description nodes.
|
|
10
13
|
NON_HEADLINE_SELECTOR = (HEADING_TAGS.map { |tag| ":not(#{tag})" } + INVISIBLE_CONTENT_TAGS.to_a).freeze
|
|
11
14
|
|
|
15
|
+
# Anchor selector used to identify the canonical article link element.
|
|
12
16
|
MAIN_ANCHOR_SELECTOR = begin
|
|
13
17
|
buf = +'a[href]:not([href=""])'
|
|
14
18
|
%w[# javascript: mailto: tel: file:// sms: data:].each do |prefix|
|
|
@@ -56,6 +60,7 @@ module Html2rss
|
|
|
56
60
|
@selected_anchor = selected_anchor
|
|
57
61
|
end
|
|
58
62
|
|
|
63
|
+
# @return [Hash{Symbol => Object}] extracted article attributes
|
|
59
64
|
def call
|
|
60
65
|
{
|
|
61
66
|
title: extract_title,
|
|
@@ -23,6 +23,10 @@ module Html2rss
|
|
|
23
23
|
##
|
|
24
24
|
# Think of it as `css_upwards` method.
|
|
25
25
|
# It searches for the closest parent that matches the given selector.
|
|
26
|
+
#
|
|
27
|
+
# @param current_tag [Nokogiri::XML::Node, nil] starting node
|
|
28
|
+
# @param selector [String] CSS selector to search upwards for
|
|
29
|
+
# @return [Nokogiri::XML::Node, nil] first matching node in upward traversal
|
|
26
30
|
def find_closest_selector_upwards(current_tag, selector)
|
|
27
31
|
while current_tag
|
|
28
32
|
found = current_tag.at_css(selector)
|
|
@@ -36,6 +40,10 @@ module Html2rss
|
|
|
36
40
|
|
|
37
41
|
##
|
|
38
42
|
# Searches for the closest parent that matches the given tag name.
|
|
43
|
+
#
|
|
44
|
+
# @param current_tag [Nokogiri::XML::Node] starting node
|
|
45
|
+
# @param tag_name [String] tag name to find in ancestors
|
|
46
|
+
# @return [Nokogiri::XML::Node, nil] matching ancestor node
|
|
39
47
|
def find_tag_in_ancestors(current_tag, tag_name)
|
|
40
48
|
return current_tag if current_tag.name == tag_name
|
|
41
49
|
|
|
@@ -6,15 +6,20 @@ module Html2rss
|
|
|
6
6
|
module Rendering
|
|
7
7
|
# Renders an HTML <audio> tag from a URL and type.
|
|
8
8
|
class AudioRenderer
|
|
9
|
+
# @param url [String, Html2rss::Url] media URL for the audio source
|
|
10
|
+
# @param type [String] MIME type for the audio source
|
|
9
11
|
def initialize(url:, type:)
|
|
10
12
|
@url = url
|
|
11
13
|
@type = type
|
|
12
14
|
end
|
|
13
15
|
|
|
16
|
+
# @return [String] HTML audio snippet for article rendering
|
|
14
17
|
def to_html
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
+
[
|
|
19
|
+
'<audio controls preload="none" referrerpolicy="no-referrer" crossorigin="anonymous">',
|
|
20
|
+
%(<source src="#{escaped_url}" type="#{escaped_type}">),
|
|
21
|
+
'</audio>'
|
|
22
|
+
].join
|
|
18
23
|
end
|
|
19
24
|
|
|
20
25
|
private
|
|
@@ -16,7 +16,6 @@ module Html2rss
|
|
|
16
16
|
# image: "https://example.com/image.jpg"
|
|
17
17
|
# )
|
|
18
18
|
# description = builder.call
|
|
19
|
-
#
|
|
20
19
|
class DescriptionBuilder
|
|
21
20
|
# Removes the specified pattern from the beginning of the text
|
|
22
21
|
# within a given range if the pattern occurs before the range's end.
|
|
@@ -6,23 +6,33 @@ module Html2rss
|
|
|
6
6
|
module Rendering
|
|
7
7
|
# Renders an HTML <img> tag from a URL and title.
|
|
8
8
|
class ImageRenderer
|
|
9
|
+
# @param url [String, Html2rss::Url] image URL for the src attribute
|
|
10
|
+
# @param title [String, nil] title/alt text for the image
|
|
9
11
|
def initialize(url:, title:)
|
|
10
12
|
@url = url
|
|
11
13
|
@title = title
|
|
12
14
|
end
|
|
13
15
|
|
|
16
|
+
# @return [String] HTML image snippet for article rendering
|
|
14
17
|
def to_html
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
18
|
+
attributes = [
|
|
19
|
+
%(src="#{escaped_url}"),
|
|
20
|
+
%(alt="#{escaped_title}"),
|
|
21
|
+
%(title="#{escaped_title}"),
|
|
22
|
+
'loading="lazy"',
|
|
23
|
+
'referrerpolicy="no-referrer"',
|
|
24
|
+
'decoding="async"',
|
|
25
|
+
'crossorigin="anonymous"'
|
|
26
|
+
]
|
|
27
|
+
"<img #{attributes.join(' ')}>"
|
|
22
28
|
end
|
|
23
29
|
|
|
24
30
|
private
|
|
25
31
|
|
|
32
|
+
def escaped_url
|
|
33
|
+
CGI.escapeHTML(@url.to_s)
|
|
34
|
+
end
|
|
35
|
+
|
|
26
36
|
def escaped_title
|
|
27
37
|
CGI.escapeHTML(@title.to_s)
|
|
28
38
|
end
|
|
@@ -16,6 +16,10 @@ module Html2rss
|
|
|
16
16
|
end
|
|
17
17
|
|
|
18
18
|
# @private
|
|
19
|
+
# @param type [String, nil] enclosure MIME type
|
|
20
|
+
# @param url [String, Html2rss::Url] enclosure URL
|
|
21
|
+
# @param title [String, nil] title used by image renderer
|
|
22
|
+
# @return [ImageRenderer, VideoRenderer, AudioRenderer, PdfRenderer, nil]
|
|
19
23
|
def self.create_renderer_for_type(type, url:, title:)
|
|
20
24
|
case type
|
|
21
25
|
when %r{^image/}
|
|
@@ -6,16 +6,22 @@ module Html2rss
|
|
|
6
6
|
module Rendering
|
|
7
7
|
# Renders an HTML <iframe> for PDF documents.
|
|
8
8
|
class PdfRenderer
|
|
9
|
+
# @param url [String, Html2rss::Url] PDF URL rendered in the iframe
|
|
9
10
|
def initialize(url:)
|
|
10
11
|
@url = url
|
|
11
12
|
end
|
|
12
13
|
|
|
14
|
+
# @return [String] HTML iframe snippet for PDF rendering
|
|
13
15
|
def to_html
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
16
|
+
attributes = [
|
|
17
|
+
%(src="#{escaped_url}"),
|
|
18
|
+
'width="100%"',
|
|
19
|
+
'height="75vh"',
|
|
20
|
+
'sandbox=""',
|
|
21
|
+
'referrerpolicy="no-referrer"',
|
|
22
|
+
'loading="lazy"'
|
|
23
|
+
]
|
|
24
|
+
"<iframe #{attributes.join(' ')}></iframe>"
|
|
19
25
|
end
|
|
20
26
|
|
|
21
27
|
private
|
|
@@ -6,15 +6,20 @@ module Html2rss
|
|
|
6
6
|
module Rendering
|
|
7
7
|
# Renders an HTML <video> tag from a URL and type.
|
|
8
8
|
class VideoRenderer
|
|
9
|
+
# @param url [String, Html2rss::Url] media URL for the video source
|
|
10
|
+
# @param type [String] MIME type for the video source
|
|
9
11
|
def initialize(url:, type:)
|
|
10
12
|
@url = url
|
|
11
13
|
@type = type
|
|
12
14
|
end
|
|
13
15
|
|
|
16
|
+
# @return [String] HTML video snippet for article rendering
|
|
14
17
|
def to_html
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
+
[
|
|
19
|
+
'<video controls preload="none" referrerpolicy="no-referrer" crossorigin="anonymous" playsinline>',
|
|
20
|
+
%(<source src="#{escaped_url}" type="#{escaped_type}">),
|
|
21
|
+
'</video>'
|
|
22
|
+
].join
|
|
18
23
|
end
|
|
19
24
|
|
|
20
25
|
private
|
data/lib/html2rss/rendering.rb
CHANGED
|
@@ -5,8 +5,17 @@ module Html2rss
|
|
|
5
5
|
# images, audio, video, or embedded documents for feed descriptions.
|
|
6
6
|
#
|
|
7
7
|
# @example
|
|
8
|
-
# Html2rss::Rendering::ImageRenderer.new(
|
|
9
|
-
#
|
|
8
|
+
# Html2rss::Rendering::ImageRenderer.new(
|
|
9
|
+
# url: "https://example.com/image.jpg",
|
|
10
|
+
# title: "Example"
|
|
11
|
+
# ).to_html
|
|
12
|
+
#
|
|
13
|
+
# @example
|
|
14
|
+
# Html2rss::Rendering::MediaRenderer.for(
|
|
15
|
+
# enclosure: nil,
|
|
16
|
+
# image: "https://example.com/image.jpg",
|
|
17
|
+
# title: "Example"
|
|
18
|
+
# )
|
|
10
19
|
#
|
|
11
20
|
# @see Html2rss::Rendering::DescriptionBuilder
|
|
12
21
|
module Rendering
|
|
@@ -4,15 +4,20 @@ module Html2rss
|
|
|
4
4
|
##
|
|
5
5
|
# Tracks runtime request controls together with whether each value was explicitly set.
|
|
6
6
|
class RequestControls
|
|
7
|
+
# Request-control keys accepted at the top level of feed config.
|
|
7
8
|
TOP_LEVEL_KEYS = %i[strategy].freeze
|
|
9
|
+
# Request-control keys accepted under the nested `request` config.
|
|
8
10
|
REQUEST_KEYS = %i[max_redirects max_requests].freeze
|
|
9
11
|
|
|
10
12
|
##
|
|
11
|
-
# @param config [Hash
|
|
13
|
+
# @param config [Hash{Symbol => Object}] raw config input
|
|
12
14
|
# @return [RequestControls] request controls extracted from the config hash
|
|
13
15
|
def self.from_config(config)
|
|
16
|
+
HashUtil.assert_symbol_keys!(config, context: 'config', deep: false)
|
|
17
|
+
HashUtil.assert_symbol_keys!(config[:request], context: 'config[:request]') if config[:request].is_a?(Hash)
|
|
18
|
+
|
|
14
19
|
new(
|
|
15
|
-
strategy:
|
|
20
|
+
strategy: config[:strategy],
|
|
16
21
|
max_redirects: request_value_for(config, :max_redirects),
|
|
17
22
|
max_requests: request_value_for(config, :max_requests),
|
|
18
23
|
explicit_keys: explicit_keys_for(config)
|
|
@@ -20,33 +25,23 @@ module Html2rss
|
|
|
20
25
|
end
|
|
21
26
|
|
|
22
27
|
def self.explicit_keys_for(config)
|
|
23
|
-
TOP_LEVEL_KEYS.filter {
|
|
28
|
+
TOP_LEVEL_KEYS.filter { config.key?(_1) } +
|
|
24
29
|
REQUEST_KEYS.filter { request_key?(config, _1) }
|
|
25
30
|
end
|
|
26
31
|
|
|
27
|
-
def self.value_for(config, key)
|
|
28
|
-
return config[key] if config.key?(key)
|
|
29
|
-
return config[key.to_s] if config.key?(key.to_s)
|
|
30
|
-
|
|
31
|
-
nil
|
|
32
|
-
end
|
|
33
|
-
|
|
34
32
|
def self.request_value_for(config, key)
|
|
35
|
-
request_config =
|
|
33
|
+
request_config = config[:request]
|
|
36
34
|
return nil unless request_config.is_a?(Hash)
|
|
37
35
|
|
|
38
|
-
|
|
39
|
-
end
|
|
40
|
-
|
|
41
|
-
def self.top_level_key?(config, key)
|
|
42
|
-
config.key?(key) || config.key?(key.to_s)
|
|
36
|
+
request_config[key]
|
|
43
37
|
end
|
|
44
38
|
|
|
45
39
|
def self.request_key?(config, key)
|
|
46
|
-
request_config =
|
|
47
|
-
request_config.is_a?(Hash) &&
|
|
40
|
+
request_config = config[:request]
|
|
41
|
+
request_config.is_a?(Hash) && request_config.key?(key)
|
|
48
42
|
end
|
|
49
|
-
|
|
43
|
+
|
|
44
|
+
private_class_method :explicit_keys_for, :request_value_for, :request_key?
|
|
50
45
|
|
|
51
46
|
##
|
|
52
47
|
# @param strategy [Symbol, nil] effective request strategy
|
|
@@ -97,8 +92,8 @@ module Html2rss
|
|
|
97
92
|
##
|
|
98
93
|
# Applies only explicitly set controls to the provided config hash.
|
|
99
94
|
#
|
|
100
|
-
# @param config [Hash
|
|
101
|
-
# @return [Hash
|
|
95
|
+
# @param config [Hash{Symbol => Object}] mutable config hash
|
|
96
|
+
# @return [Hash{Symbol => Object}] the same hash with explicit controls written
|
|
102
97
|
def apply_to(config)
|
|
103
98
|
config[:strategy] = strategy if explicit?(:strategy)
|
|
104
99
|
apply_request_value(config, :max_redirects, max_redirects)
|