html2rss 0.16.0 → 0.18.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +48 -657
- data/exe/html2rss +1 -1
- data/html2rss.gemspec +7 -4
- data/lib/html2rss/articles/deduplicator.rb +49 -0
- data/lib/html2rss/auto_source/cleanup.rb +33 -5
- data/lib/html2rss/auto_source/scraper/html.rb +118 -43
- data/lib/html2rss/auto_source/scraper/json_state.rb +377 -0
- data/lib/html2rss/auto_source/scraper/microdata.rb +399 -0
- data/lib/html2rss/auto_source/scraper/schema/category_extractor.rb +102 -0
- data/lib/html2rss/auto_source/scraper/schema/item_list.rb +2 -2
- data/lib/html2rss/auto_source/scraper/schema/list_item.rb +3 -3
- data/lib/html2rss/auto_source/scraper/schema/thing.rb +48 -8
- data/lib/html2rss/auto_source/scraper/schema.rb +12 -9
- data/lib/html2rss/auto_source/scraper/semantic_html/anchor_selector.rb +199 -0
- data/lib/html2rss/auto_source/scraper/semantic_html.rb +84 -78
- data/lib/html2rss/auto_source/scraper/wordpress_api/page_scope.rb +261 -0
- data/lib/html2rss/auto_source/scraper/wordpress_api/posts_endpoint.rb +134 -0
- data/lib/html2rss/auto_source/scraper/wordpress_api.rb +179 -0
- data/lib/html2rss/auto_source/scraper.rb +142 -8
- data/lib/html2rss/auto_source.rb +119 -47
- data/lib/html2rss/blocked_surface.rb +64 -0
- data/lib/html2rss/category_extractor.rb +82 -0
- data/lib/html2rss/cli.rb +170 -23
- data/lib/html2rss/config/class_methods.rb +189 -0
- data/lib/html2rss/config/dynamic_params.rb +68 -0
- data/lib/html2rss/config/multiple_feeds_config.rb +50 -0
- data/lib/html2rss/config/request_headers.rb +130 -0
- data/lib/html2rss/config/schema.rb +208 -0
- data/lib/html2rss/config/validator.rb +108 -0
- data/lib/html2rss/config.rb +112 -61
- data/lib/html2rss/error.rb +6 -0
- data/lib/html2rss/html_extractor/date_extractor.rb +19 -0
- data/lib/html2rss/html_extractor/enclosure_extractor.rb +101 -0
- data/lib/html2rss/html_extractor/image_extractor.rb +49 -0
- data/lib/html2rss/html_extractor.rb +136 -0
- data/lib/html2rss/html_navigator.rb +46 -0
- data/lib/html2rss/json_feed_builder/item.rb +94 -0
- data/lib/html2rss/json_feed_builder.rb +58 -0
- data/lib/html2rss/rendering/audio_renderer.rb +31 -0
- data/lib/html2rss/rendering/description_builder.rb +88 -0
- data/lib/html2rss/rendering/image_renderer.rb +31 -0
- data/lib/html2rss/rendering/media_renderer.rb +33 -0
- data/lib/html2rss/rendering/pdf_renderer.rb +28 -0
- data/lib/html2rss/rendering/video_renderer.rb +31 -0
- data/lib/html2rss/rendering.rb +14 -0
- data/lib/html2rss/request_controls.rb +128 -0
- data/lib/html2rss/request_service/browserless_strategy.rb +103 -7
- data/lib/html2rss/request_service/budget.rb +39 -0
- data/lib/html2rss/request_service/context.rb +64 -20
- data/lib/html2rss/request_service/faraday_strategy.rb +135 -5
- data/lib/html2rss/request_service/policy.rb +248 -0
- data/lib/html2rss/request_service/puppet_commander.rb +212 -13
- data/lib/html2rss/request_service/response.rb +42 -2
- data/lib/html2rss/request_service/response_guard.rb +62 -0
- data/lib/html2rss/request_service.rb +31 -15
- data/lib/html2rss/request_session/rel_next_pager.rb +70 -0
- data/lib/html2rss/request_session/runtime_input.rb +57 -0
- data/lib/html2rss/request_session/runtime_policy.rb +76 -0
- data/lib/html2rss/request_session.rb +118 -0
- data/lib/html2rss/rss_builder/article.rb +166 -0
- data/lib/html2rss/rss_builder/channel.rb +96 -11
- data/lib/html2rss/rss_builder/enclosure.rb +48 -0
- data/lib/html2rss/rss_builder/stylesheet.rb +4 -4
- data/lib/html2rss/rss_builder.rb +72 -71
- data/lib/html2rss/selectors/config.rb +122 -0
- data/lib/html2rss/selectors/extractors/attribute.rb +50 -0
- data/lib/html2rss/selectors/extractors/href.rb +53 -0
- data/lib/html2rss/selectors/extractors/html.rb +48 -0
- data/lib/html2rss/selectors/extractors/static.rb +41 -0
- data/lib/html2rss/selectors/extractors/text.rb +46 -0
- data/lib/html2rss/selectors/extractors.rb +52 -0
- data/lib/html2rss/selectors/object_to_xml_converter.rb +61 -0
- data/lib/html2rss/selectors/post_processors/base.rb +74 -0
- data/lib/html2rss/selectors/post_processors/gsub.rb +85 -0
- data/lib/html2rss/selectors/post_processors/html_to_markdown.rb +45 -0
- data/lib/html2rss/selectors/post_processors/html_transformers/transform_urls_to_absolute_ones.rb +35 -0
- data/lib/html2rss/selectors/post_processors/html_transformers/wrap_img_in_a.rb +47 -0
- data/lib/html2rss/selectors/post_processors/markdown_to_html.rb +52 -0
- data/lib/html2rss/selectors/post_processors/parse_time.rb +73 -0
- data/lib/html2rss/selectors/post_processors/parse_uri.rb +40 -0
- data/lib/html2rss/selectors/post_processors/sanitize_html.rb +150 -0
- data/lib/html2rss/selectors/post_processors/substring.rb +74 -0
- data/lib/html2rss/selectors/post_processors/template.rb +73 -0
- data/lib/html2rss/selectors/post_processors.rb +43 -0
- data/lib/html2rss/selectors.rb +294 -0
- data/lib/html2rss/url.rb +262 -0
- data/lib/html2rss/version.rb +1 -1
- data/lib/html2rss.rb +129 -70
- data/lib/tasks/config_schema.rake +17 -0
- data/schema/html2rss-config.schema.json +469 -0
- metadata +120 -46
- data/lib/html2rss/attribute_post_processors/base.rb +0 -74
- data/lib/html2rss/attribute_post_processors/gsub.rb +0 -64
- data/lib/html2rss/attribute_post_processors/html_to_markdown.rb +0 -43
- data/lib/html2rss/attribute_post_processors/html_transformers/transform_urls_to_absolute_ones.rb +0 -27
- data/lib/html2rss/attribute_post_processors/html_transformers/wrap_img_in_a.rb +0 -41
- data/lib/html2rss/attribute_post_processors/markdown_to_html.rb +0 -50
- data/lib/html2rss/attribute_post_processors/parse_time.rb +0 -46
- data/lib/html2rss/attribute_post_processors/parse_uri.rb +0 -46
- data/lib/html2rss/attribute_post_processors/sanitize_html.rb +0 -108
- data/lib/html2rss/attribute_post_processors/substring.rb +0 -72
- data/lib/html2rss/attribute_post_processors/template.rb +0 -101
- data/lib/html2rss/attribute_post_processors.rb +0 -44
- data/lib/html2rss/auto_source/article.rb +0 -127
- data/lib/html2rss/auto_source/channel.rb +0 -78
- data/lib/html2rss/auto_source/reducer.rb +0 -48
- data/lib/html2rss/auto_source/rss_builder.rb +0 -70
- data/lib/html2rss/auto_source/scraper/semantic_html/extractor.rb +0 -136
- data/lib/html2rss/auto_source/scraper/semantic_html/image.rb +0 -54
- data/lib/html2rss/config/channel.rb +0 -125
- data/lib/html2rss/config/selectors.rb +0 -103
- data/lib/html2rss/item.rb +0 -186
- data/lib/html2rss/item_extractors/attribute.rb +0 -50
- data/lib/html2rss/item_extractors/href.rb +0 -52
- data/lib/html2rss/item_extractors/html.rb +0 -46
- data/lib/html2rss/item_extractors/static.rb +0 -39
- data/lib/html2rss/item_extractors/text.rb +0 -44
- data/lib/html2rss/item_extractors.rb +0 -88
- data/lib/html2rss/object_to_xml_converter.rb +0 -56
- data/lib/html2rss/rss_builder/item.rb +0 -83
- data/lib/html2rss/utils.rb +0 -113
|
@@ -0,0 +1,101 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Html2rss
|
|
4
|
+
class HtmlExtractor
|
|
5
|
+
##
|
|
6
|
+
# Extracts enclosures from HTML tags using various strategies.
|
|
7
|
+
class EnclosureExtractor
  ##
  # Runs every enclosure strategy against the article tag and returns
  # their results as one flat list, in strategy order.
  #
  # @param article_tag [#css] node handed to each strategy
  # @param base_url [Object] forwarded to each strategy as the `base_url:` keyword
  # @return [Array<Hash>] concatenated strategy results
  def self.call(article_tag, base_url)
    strategies = [
      Extractors::Image,
      Extractors::Media,
      Extractors::Pdf,
      Extractors::Iframe,
      Extractors::Archive
    ]

    strategies.map { |strategy| strategy.call(article_tag, base_url:) }.flatten(1)
  end
end
|
|
18
|
+
|
|
19
|
+
module Extractors
|
|
20
|
+
# Extracts image enclosures from HTML tags.
|
|
21
|
+
# Finds all image sources and returns them in a format suitable for RSS.
|
|
22
|
+
class Image
  ##
  # Collects one enclosure hash per `<img>` whose `src` is present,
  # non-empty and not matched by the `src^="data"` exclusion.
  #
  # @param article_tag [#css] node to search
  # @param base_url [Object] passed to Url.from_relative to resolve each src
  # @return [Array<Hash>] hashes with :url and :type keys
  def self.call(article_tag, base_url:)
    sources = article_tag.css('img[src]:not([src^="data"])').map { |img| img['src'].to_s }

    sources.reject(&:empty?).map do |src|
      resolved = Url.from_relative(src, base_url)
      content_type = RssBuilder::Enclosure.guess_content_type_from_url(resolved, default: 'image/jpeg')

      { url: resolved, type: content_type }
    end
  end
end
|
|
36
|
+
|
|
37
|
+
# Extracts media enclosures (video/audio) from HTML tags.
|
|
38
|
+
class Media
  ##
  # Collects one enclosure hash per audio/video source found in the tag.
  #
  # Both `<source src>` children and a `src` set directly on the media
  # element are considered. `video[src]` is included alongside
  # `audio[src]` so that both media elements are handled the same way.
  #
  # @param article_tag [#css] node to search
  # @param base_url [Object] passed to Url.from_relative to resolve each src
  # @return [Array<Hash>] hashes with :url and :type keys; :type is the
  #   element's `type` attribute and may be nil when it is absent
  def self.call(article_tag, base_url:)
    selector = 'video source[src], video[src], audio source[src], audio[src]'

    article_tag.css(selector).filter_map do |element|
      src = element['src'].to_s
      next if src.empty?

      {
        url: Url.from_relative(src, base_url),
        type: element['type']
      }
    end
  end
end
|
|
51
|
+
|
|
52
|
+
# Extracts PDF enclosures from HTML tags.
|
|
53
|
+
class Pdf
  ##
  # Collects one enclosure hash per link whose href ends in ".pdf".
  #
  # @param article_tag [#css] node to search
  # @param base_url [Object] passed to Url.from_relative to resolve each href
  # @return [Array<Hash>] hashes with :url and :type keys
  def self.call(article_tag, base_url:)
    hrefs = article_tag.css('a[href$=".pdf"]').map { |link| link['href'].to_s }

    hrefs.reject(&:empty?).map do |href|
      resolved = Url.from_relative(href, base_url)

      { url: resolved, type: RssBuilder::Enclosure.guess_content_type_from_url(resolved) }
    end
  end
end
|
|
67
|
+
|
|
68
|
+
# Extracts iframe enclosures from HTML tags.
|
|
69
|
+
class Iframe
  ##
  # Collects one enclosure hash per `<iframe>` with a non-empty `src`.
  #
  # @param article_tag [#css] node to search
  # @param base_url [Object] passed to Url.from_relative to resolve each src
  # @return [Array<Hash>] hashes with :url and :type keys
  def self.call(article_tag, base_url:)
    article_tag.css('iframe[src]').each_with_object([]) do |iframe, enclosures|
      src = iframe['src']
      next unless src && !src.empty?

      resolved = Url.from_relative(src, base_url)
      content_type = RssBuilder::Enclosure.guess_content_type_from_url(resolved, default: 'text/html')
      enclosures << { url: resolved, type: content_type }
    end
  end
end
|
|
83
|
+
|
|
84
|
+
# Extracts archive enclosures (zip, tar.gz, tgz) from HTML tags.
|
|
85
|
+
class Archive
  ##
  # Collects one enclosure hash per link to a zip, tar.gz or tgz archive.
  #
  # Gzip-compressed tarballs are reported as `application/gzip`; only
  # `.zip` links are reported as `application/zip`.
  #
  # @param article_tag [#css] node to search
  # @param base_url [Object] passed to Url.from_relative to resolve each href
  # @return [Array<Hash>] hashes with :url and :type keys
  def self.call(article_tag, base_url:)
    article_tag.css('a[href$=".zip"], a[href$=".tar.gz"], a[href$=".tgz"]').filter_map do |link|
      href = link['href'].to_s
      next if href.empty?

      {
        url: Url.from_relative(href, base_url),
        # The selector only admits these three suffixes, so anything
        # that is not a gzip tarball is a zip file.
        type: href.end_with?('.tar.gz', '.tgz') ? 'application/gzip' : 'application/zip'
      }
    end
  end
end
|
|
99
|
+
end
|
|
100
|
+
end
|
|
101
|
+
end
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Html2rss
|
|
4
|
+
class HtmlExtractor
|
|
5
|
+
##
|
|
6
|
+
# ImageExtractor is responsible for extracting image URLs from the article_tag.
|
|
7
|
+
class ImageExtractor
  ##
  # Picks an image source for the article tag, trying srcset candidates
  # first, then a plain `<img src>`, then an inline-style `url(...)`.
  #
  # @param article_tag [#css, #at_css] node to search
  # @param base_url [Object] passed to Url.from_relative to resolve the source
  # @return [Object, nil] result of Url.from_relative, or nil when no source was found
  def self.call(article_tag, base_url:)
    finders = %i[from_source from_img from_style]
    candidate = finders.lazy.filter_map { |finder| public_send(finder, article_tag) }.first
    return unless candidate

    Url.from_relative(candidate, base_url)
  end

  ##
  # @param article_tag [#at_css] node to search
  # @return [String, nil] `src` of the first `<img>` accepted by the selector
  def self.from_img(article_tag)
    img = article_tag.at_css('img[src]:not([src^="data"])')
    img && img['src']
  end

  ##
  # Reads the srcset of `<img>` and `<picture> > <source>` elements and
  # returns the candidate URL with the largest `w`/`h` descriptor.
  # Candidates starting with "data:" are ignored. When several
  # candidates share a descriptor value, the last one seen wins.
  #
  # @see <https://developer.mozilla.org/en-US/docs/Learn/HTML/Multimedia_and_embedding/Responsive_images>
  # @see <https://developer.mozilla.org/en-US/docs/Web/HTML/Element/img#srcset>
  # @see <https://developer.mozilla.org/en-US/docs/Web/HTML/Element/picture>
  # @param article_tag [#css] node to search
  # @return [String, nil] the chosen URL, or nil when nothing matched
  def self.from_source(article_tag)
    pairs = article_tag.css('img[srcset], picture > source[srcset]').flat_map do |node|
      node['srcset'].to_s.scan(/(\S+)\s+(\d+w|\d+h)[\s,]?/).filter_map do |(candidate, descriptor)|
        next if candidate.nil? || candidate.start_with?('data:')

        # The descriptor always starts with digits, so to_i yields its number.
        [descriptor.to_i, candidate.strip]
      end
    end

    by_size = pairs.to_h
    by_size[by_size.keys.max]
  end

  ##
  # Looks for `url(...)` inside inline style attributes and returns the
  # longest non-data URL found.
  #
  # @param article_tag [#css] node to search
  # @return [String, nil]
  def self.from_style(article_tag)
    urls = article_tag.css('[style*="url"]').filter_map do |tag|
      tag['style'][/url\(['"]?(.*?)['"]?\)/, 1]
    end

    usable = urls.reject { |src| src.start_with?('data:') }
    usable.max_by(&:size)
  end
end
|
|
48
|
+
end
|
|
49
|
+
end
|
|
@@ -0,0 +1,136 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Html2rss
|
|
4
|
+
##
|
|
5
|
+
# HtmlExtractor is responsible for extracting details (headline, url, images, etc.)
|
|
6
|
+
# from an article_tag.
|
|
7
|
+
class HtmlExtractor
  # Tag names whose subtree is skipped when collecting visible text.
  INVISIBLE_CONTENT_TAGS = %w[svg script noscript style template].to_set.freeze
  HEADING_TAGS = %w[h1 h2 h3 h4 h5 h6].freeze
  # NOTE(review): this is an Array of selector fragments (":not(hN)" entries
  # followed by the invisible tag names themselves) and is passed as a single
  # argument to #css in extract_description — confirm this selects what is intended.
  NON_HEADLINE_SELECTOR = (HEADING_TAGS.map { |tag| ":not(#{tag})" } + INVISIBLE_CONTENT_TAGS.to_a).freeze

  # CSS selector for anchors with a non-empty href that does not start
  # with a fragment or one of the listed non-navigational schemes.
  MAIN_ANCHOR_SELECTOR = begin
    buf = +'a[href]:not([href=""])'
    %w[# javascript: mailto: tel: file:// sms: data:].each do |prefix|
      buf << %[:not([href^="#{prefix}"])]
    end
    buf.freeze
  end

  class << self
    ##
    # Extracts visible text from a given node and its children.
    #
    # Nested children are joined with the default separator; the given
    # separator only applies between the direct children of `tag`.
    #
    # @param tag [Nokogiri::XML::Node] the node from which to extract visible text
    # @param separator [String] separator used to join text fragments (default is a space)
    # @return [String, nil] the concatenated visible text, or nil if none is found
    def extract_visible_text(tag, separator: ' ')
      parts = tag.children.filter_map do |child|
        next unless visible_child?(child)

        raw_text = child.children.empty? ? child.text : extract_visible_text(child)
        text = raw_text&.strip
        text unless text.to_s.empty?
      end

      parts.join(separator).squeeze(' ').strip unless parts.empty?
    end

    private

    # A child is skipped when it is an invisible-content tag or an
    # anchor whose href starts with "#".
    def visible_child?(node)
      !INVISIBLE_CONTENT_TAGS.include?(node.name) &&
        !(node.name == 'a' && node['href']&.start_with?('#'))
    end
  end

  ##
  # @param article_tag [Nokogiri::XML::Node] article-like container to extract from
  # @param base_url [String, Html2rss::Url] base url used to resolve relative links
  # @param selected_anchor [Nokogiri::XML::Node, nil] explicit primary anchor for the container
  # @raise [ArgumentError] when article_tag is nil or false
  def initialize(article_tag, base_url:, selected_anchor:)
    raise ArgumentError, 'article_tag is required' unless article_tag

    @article_tag = article_tag
    @base_url = base_url
    @selected_anchor = selected_anchor
  end

  ##
  # @return [Hash] the extracted article attributes
  def call
    {
      title: extract_title,
      url: extract_url,
      image: extract_image,
      description: extract_description,
      id: generate_id,
      published_at: extract_published_at,
      enclosures: extract_enclosures,
      categories: extract_categories
    }
  end

  private

  attr_reader :article_tag, :base_url, :selected_anchor

  class << self
    ##
    # @param article_tag [Nokogiri::XML::Node] article-like container to search within
    # @return [Nokogiri::XML::Node, nil] first eligible descendant anchor
    def main_anchor_for(article_tag)
      return article_tag if article_tag.name == 'a' && article_tag.matches?(MAIN_ANCHOR_SELECTOR)

      article_tag.at_css(MAIN_ANCHOR_SELECTOR)
    end
  end

  ##
  # Resolves the selected anchor's href, without its fragment, against base_url.
  #
  # @return [Object, nil] result of Url.from_relative, or nil when there is no href
  def extract_url
    @extract_url ||= begin
      href = selected_anchor&.[]('href').to_s

      # partition always yields a String head, even for a fragment-only
      # href such as "#", where split('#').first would be nil.
      Url.from_relative(href.partition('#').first.strip, base_url) unless href.empty?
    end
  end

  # Title comes from the chosen heading, falling back to the selected anchor.
  def extract_title
    title_source = heading || selected_anchor
    self.class.extract_visible_text(title_source) if title_source
  end

  # Picks, among the headings with the smallest tag name present
  # (h1 before h2, ...), the one with the longest visible text.
  def heading
    @heading ||= begin
      heading_tags = article_tag.css(HEADING_TAGS.join(',')).group_by(&:name)
      smallest_heading = heading_tags.keys.min
      if smallest_heading
        heading_tags[smallest_heading]&.max_by do |tag|
          self.class.extract_visible_text(tag)&.size.to_i
        end
      end
    end
  end

  # Prefers text from the NON_HEADLINE_SELECTOR matches joined with "<br>",
  # falling back to the visible text of the whole article tag.
  def extract_description
    text = self.class.extract_visible_text(article_tag.css(NON_HEADLINE_SELECTOR), separator: '<br>')
    return text if text && !text.empty?

    description = self.class.extract_visible_text(article_tag)
    return nil if description.nil? || description.strip.empty?

    description.strip
  end

  # First non-empty value of: the tag's own id, the first descendant id,
  # the URL path, the URL query.
  def generate_id
    [
      article_tag['id'],
      article_tag.at_css('[id]')&.attr('id'),
      extract_url&.path,
      extract_url&.query
    ].compact.reject(&:empty?).first
  end

  def extract_image = ImageExtractor.call(article_tag, base_url:)
  def extract_published_at = DateExtractor.call(article_tag)
  def extract_enclosures = EnclosureExtractor.call(article_tag, base_url)
  def extract_categories = CategoryExtractor.call(article_tag)
end
|
|
136
|
+
end
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Html2rss
|
|
4
|
+
##
|
|
5
|
+
# HtmlNavigator provides methods to navigate through HTML nodes.
|
|
6
|
+
class HtmlNavigator
  class << self
    ##
    # Walks from the node towards the root and returns the first node
    # (the start node included) for which the condition holds. The walk
    # stops without a result at the document or at an `html` element.
    #
    # @param node [Nokogiri::XML::Node] The node to start the search from.
    # @param condition [Proc] The condition to be met.
    # @return [Nokogiri::XML::Node, nil] The first node that satisfies the condition.
    def parent_until_condition(node, condition)
      return unless node
      return if node.document? || node.name == 'html'
      return node if condition.call(node)

      parent_until_condition(node.parent, condition)
    end

    ##
    # Think of it as a `css_upwards` method: runs `at_css(selector)` on
    # the tag, then on each successive parent, and returns the first hit.
    #
    # @return [Object, nil] the first `at_css` result, or nil when none is found
    def find_closest_selector_upwards(current_tag, selector)
      return unless current_tag

      match = current_tag.at_css(selector)
      return match if match

      find_closest_selector_upwards(current_tag.parent, selector) if current_tag.respond_to?(:parent)
    end

    ##
    # Returns the tag itself when its name equals tag_name, otherwise
    # the first entry of its `ancestors(tag_name)`.
    def find_tag_in_ancestors(current_tag, tag_name)
      current_tag.name == tag_name ? current_tag : current_tag.ancestors(tag_name).first
    end
  end
end
|
|
46
|
+
end
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Html2rss
|
|
4
|
+
class JsonFeedBuilder
|
|
5
|
+
##
|
|
6
|
+
# Maps an {Html2rss::RssBuilder::Article} to a JSONFeed 1.1 item hash.
|
|
7
|
+
class Item
  ##
  # @param article [#guid, #url, #title, #image, #published_at, #author,
  #   #description, #categories, #enclosures] the article to map
  def initialize(article)
    @article = article
  end

  ##
  # Builds the item hash with nil-valued keys removed. Returns nil when
  # the article has neither a description nor a title to use as content.
  #
  # @return [Hash, nil] the JSONFeed-compliant item hash
  def to_h
    body = content_fields
    { **item_payload, **body }.compact unless body.empty?
  end

  private

  attr_reader :article

  ##
  # @return [Hash] item attributes other than content; values may be nil
  def item_payload
    {
      id: article.guid,
      url: article.url&.to_s,
      title: article.title,
      image: article.image&.to_s,
      date_published: article.published_at&.iso8601,
      authors: author_array,
      tags: tags,
      attachments: attachments
    }
  end

  ##
  # @return [Array<Hash>, nil] single-author list, or nil without an author
  def author_array
    name = article.author
    [{ name: }] if name
  end

  ##
  # JSON Feed items must include content_html or content_text.
  # The description is used as content_html; the title is the
  # content_text fallback.
  #
  # @return [Hash] empty when neither is available
  def content_fields
    if (html = article.description)
      { content_html: html }
    elsif (text = article.title)
      { content_text: text }
    else
      {}
    end
  end

  ##
  # @return [Array<String>, nil] the article categories, or nil when empty
  def tags
    categories = article.categories
    categories unless categories.empty?
  end

  ##
  # Maps enclosures to JSONFeed attachment objects.
  #
  # @return [Array<Hash>, nil] nil when the article has no enclosures
  def attachments
    list = article.enclosures
    list.map { |enclosure| attachment_hash(enclosure) } unless list.empty?
  end

  ##
  # @param enclosure [#url, #type, #bits_length]
  # @return [Hash] attachment hash; size_in_bytes is only present for a positive size
  def attachment_hash(enclosure)
    length = enclosure.bits_length
    attachment = { url: enclosure.url.to_s, mime_type: enclosure.type }
    attachment[:size_in_bytes] = length if length&.positive?
    attachment.compact
  end
end
|
|
93
|
+
end
|
|
94
|
+
end
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Html2rss
|
|
4
|
+
##
|
|
5
|
+
# Builds a JSONFeed 1.1 hash from channel metadata and articles.
|
|
6
|
+
#
|
|
7
|
+
# @see https://www.jsonfeed.org/version/1.1/
|
|
8
|
+
class JsonFeedBuilder
  VERSION_URL = 'https://jsonfeed.org/version/1.1'

  ##
  # @param channel [#title, #url, #description, #language, #image, #author] feed metadata
  # @param articles [Enumerable] articles, each wrapped in an Item when building
  def initialize(channel:, articles:)
    @channel = channel
    @articles = articles
  end

  ##
  # Builds and returns the JSONFeed hash, with nil-valued keys removed.
  #
  # @return [Hash] the JSONFeed-compliant hash
  def call
    feed = base_payload
    feed[:authors] = author_array
    feed[:items] = item_hashes
    feed.compact
  end

  private

  attr_reader :channel, :articles

  ##
  # @return [Hash] top-level feed attributes; values may be nil
  def base_payload
    {
      version: VERSION_URL,
      title: channel.title,
      home_page_url: channel.url.to_s,
      description: channel.description,
      language: channel.language,
      icon: channel.image&.to_s
    }
  end

  ##
  # @return [Array<Hash>] item hashes; articles whose Item#to_h is nil are dropped
  def item_hashes
    articles.map { |article| Item.new(article).to_h }.compact
  end

  ##
  # @return [Array<Hash>, nil] single-author list, or nil without a channel author
  def author_array
    name = channel.author
    [{ name: }] if name
  end
end
|
|
58
|
+
end
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'cgi'
|
|
4
|
+
|
|
5
|
+
module Html2rss
|
|
6
|
+
module Rendering
|
|
7
|
+
# Renders an HTML <audio> tag from a URL and type.
|
|
8
|
+
class AudioRenderer
  # @param url [#to_s] audio source URL
  # @param type [#to_s] MIME type emitted in the <source> tag
  def initialize(url:, type:)
    @url = url
    @type = type
  end

  # @return [String] an <audio> element wrapping a single <source>,
  #   with the URL and type HTML-escaped
  def to_html
    src = CGI.escapeHTML(@url.to_s)
    mime = CGI.escapeHTML(@type.to_s)

    %(<audio controls preload="none" referrerpolicy="no-referrer" crossorigin="anonymous">
        <source src="#{src}" type="#{mime}">
      </audio>)
  end
end
|
|
30
|
+
end
|
|
31
|
+
end
|
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Html2rss
|
|
4
|
+
module Rendering
|
|
5
|
+
# Builds a sanitized article description from the base text, title, and optional media.
|
|
6
|
+
#
|
|
7
|
+
# Combines media elements (images, audio, video, PDFs) with sanitized text content
|
|
8
|
+
# to create rich RSS descriptions that reveal more scraped information.
|
|
9
|
+
#
|
|
10
|
+
# @example Basic usage
|
|
11
|
+
# builder = DescriptionBuilder.new(
|
|
12
|
+
# base: "Article content",
|
|
13
|
+
# title: "Article Title",
|
|
14
|
+
# url: "https://example.com",
|
|
15
|
+
# enclosures: [enclosure_object],
|
|
16
|
+
# image: "https://example.com/image.jpg"
|
|
17
|
+
# )
|
|
18
|
+
# description = builder.call
|
|
19
|
+
#
|
|
20
|
+
class DescriptionBuilder
  # Removes the pattern from the text when its first occurrence starts
  # before `end_of_range`.
  #
  # The removal regex is anchored with `^`, which matches at the start
  # of every line, so it applies on each line where the pattern occurs
  # within the first `end_of_range` characters of that line.
  #
  # Non-String `text` or `pattern` (including nil) returns `text` untouched.
  # The default range is computed only after that check, so a nil text
  # no longer raises.
  #
  # @param text [String]
  # @param pattern [String]
  # @param end_of_range [Integer, nil] Optional, defaults to half the text length
  # @return [String]
  def self.remove_pattern_from_start(text, pattern, end_of_range: nil)
    return text unless text.is_a?(String) && pattern.is_a?(String)

    end_of_range ||= (text.size * 0.5).to_i

    index = text.index(pattern)
    return text if index.nil? || index >= end_of_range

    text.gsub(/^(.{0,#{end_of_range}})#{Regexp.escape(pattern)}/, '\1')
  end

  # @param base [String] The base text content for the description
  # @param title [String] The article title (used for alt text and title removal)
  # @param url [String, Html2rss::Url] The article URL (used for sanitization)
  # @param enclosures [Array<Html2rss::RssBuilder::Enclosure>, nil] Media enclosures
  # @param image [String, Html2rss::Url, nil] Fallback image URL
  def initialize(base:, title:, url:, enclosures:, image:)
    @base = base.to_s
    @title = title
    @url = url
    @enclosures = Array(enclosures)
    @image = image
  end

  # Generates the complete description: rendered media first, then the
  # sanitized base text, separated by blank lines.
  #
  # @return [String, nil] The complete description or nil if empty
  def call
    fragments = []
    fragments.concat(Array(rendered_media))
    fragments << processed_base_description

    result = fragments.compact.join("\n\n").strip
    result.empty? ? nil : result
  end

  private

  # Rendered enclosures win; the fallback image is only used when no
  # enclosure produced any markup.
  def rendered_media
    rendered = render_enclosures
    return rendered if rendered.any?
    return render_fallback_image if @image

    []
  end

  # Enclosures without a matching renderer are skipped.
  def render_enclosures
    @enclosures.filter_map do |enclosure|
      MediaRenderer.for(enclosure:, image: @image, title: @title)&.to_html
    end
  end

  def render_fallback_image
    [MediaRenderer.for(enclosure: nil, image: @image, title: @title)&.to_html]
  end

  # Strips a leading title from the base text before sanitizing it.
  def processed_base_description
    text = self.class.remove_pattern_from_start(@base, @title)
    Html2rss::Selectors::PostProcessors::SanitizeHtml.get(text, @url)
  end
end
|
|
87
|
+
end
|
|
88
|
+
end
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'cgi'
|
|
4
|
+
|
|
5
|
+
module Html2rss
|
|
6
|
+
module Rendering
|
|
7
|
+
# Renders an HTML <img> tag from a URL and title.
|
|
8
|
+
class ImageRenderer
  # @param url [#to_s] image URL
  # @param title [#to_s] text used for both the alt and title attributes
  def initialize(url:, title:)
    @url = url
    @title = title
  end

  # Renders the <img> tag on a single line. Both the URL and the title
  # are HTML-escaped so neither can break out of its attribute.
  #
  # @return [String]
  def to_html
    # Newlines are removed and whitespace runs collapsed, so the
    # multi-line literal below renders as one line.
    %(<img src="#{escaped_url}"
        alt="#{escaped_title}"
        title="#{escaped_title}"
        loading="lazy"
        referrerpolicy="no-referrer"
        decoding="async"
        crossorigin="anonymous">).delete("\n").gsub(/\s+/, ' ')
  end

  private

  def escaped_url
    CGI.escapeHTML(@url.to_s)
  end

  def escaped_title
    CGI.escapeHTML(@title.to_s)
  end
end
|
|
30
|
+
end
|
|
31
|
+
end
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Html2rss
|
|
4
|
+
module Rendering
|
|
5
|
+
# Factory: picks the appropriate renderer for a given enclosure or fallback image.
|
|
6
|
+
class MediaRenderer
  # Picks a renderer for the enclosure, or an image renderer for the
  # fallback image when no enclosure is given.
  #
  # @param enclosure [Html2rss::RssBuilder::Enclosure, nil]
  # @param image [String, Html2rss::Url, nil] Fallback image URL
  # @param title [String]
  # @return [ImageRenderer, VideoRenderer, AudioRenderer, PdfRenderer, nil]
  def self.for(enclosure:, image:, title:)
    if enclosure.nil?
      ImageRenderer.new(url: image, title:) if image
    elsif enclosure
      create_renderer_for_type(enclosure.type, url: enclosure.url, title:)
    end
  end

  # Maps a MIME type to its renderer; unsupported types yield nil.
  #
  # @private
  def self.create_renderer_for_type(type, url:, title:)
    case type
    when %r{^image/} then ImageRenderer.new(url:, title:)
    when %r{^video/} then VideoRenderer.new(url:, type:)
    when %r{^audio/} then AudioRenderer.new(url:, type:)
    when 'application/pdf' then PdfRenderer.new(url:)
    end
  end
end
|
|
32
|
+
end
|
|
33
|
+
end
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'cgi'
|
|
4
|
+
|
|
5
|
+
module Html2rss
|
|
6
|
+
module Rendering
|
|
7
|
+
# Renders an HTML <iframe> for PDF documents.
|
|
8
|
+
class PdfRenderer
  # @param url [#to_s] URL of the PDF document
  def initialize(url:)
    @url = url
  end

  # @return [String] an <iframe> element pointing at the HTML-escaped URL
  def to_html
    src = CGI.escapeHTML(@url.to_s)

    %(<iframe src="#{src}" width="100%" height="75vh"
        sandbox=""
        referrerpolicy="no-referrer"
        loading="lazy">
      </iframe>)
  end
end
|
|
27
|
+
end
|
|
28
|
+
end
|