html2rss 0.17.0 → 0.19.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +90 -639
- data/exe/html2rss +1 -1
- data/html2rss.gemspec +5 -2
- data/lib/html2rss/articles/deduplicator.rb +50 -0
- data/lib/html2rss/auto_source/cleanup.rb +44 -5
- data/lib/html2rss/auto_source/scraper/html.rb +123 -43
- data/lib/html2rss/auto_source/scraper/json_state.rb +457 -0
- data/lib/html2rss/auto_source/scraper/microdata.rb +505 -0
- data/lib/html2rss/auto_source/scraper/schema/category_extractor.rb +102 -0
- data/lib/html2rss/auto_source/scraper/schema/item_list.rb +3 -2
- data/lib/html2rss/auto_source/scraper/schema/list_item.rb +6 -4
- data/lib/html2rss/auto_source/scraper/schema/thing.rb +69 -8
- data/lib/html2rss/auto_source/scraper/schema.rb +27 -12
- data/lib/html2rss/auto_source/scraper/semantic_html/anchor_selector.rb +204 -0
- data/lib/html2rss/auto_source/scraper/semantic_html.rb +88 -79
- data/lib/html2rss/auto_source/scraper/wordpress_api/page_scope.rb +311 -0
- data/lib/html2rss/auto_source/scraper/wordpress_api/posts_endpoint.rb +135 -0
- data/lib/html2rss/auto_source/scraper/wordpress_api.rb +186 -0
- data/lib/html2rss/auto_source/scraper.rb +160 -8
- data/lib/html2rss/auto_source.rb +123 -47
- data/lib/html2rss/blocked_surface.rb +65 -0
- data/lib/html2rss/category_extractor.rb +82 -0
- data/lib/html2rss/cli.rb +194 -23
- data/lib/html2rss/config/class_methods.rb +178 -0
- data/lib/html2rss/config/dynamic_params.rb +70 -0
- data/lib/html2rss/config/multiple_feeds_config.rb +51 -0
- data/lib/html2rss/config/request_headers.rb +136 -0
- data/lib/html2rss/config/schema.rb +240 -0
- data/lib/html2rss/config/validator.rb +146 -0
- data/lib/html2rss/config.rb +118 -61
- data/lib/html2rss/error.rb +31 -0
- data/lib/html2rss/feed_pipeline/auto_fallback.rb +127 -0
- data/lib/html2rss/feed_pipeline.rb +127 -0
- data/lib/html2rss/hash_util.rb +101 -0
- data/lib/html2rss/html_extractor/date_extractor.rb +20 -0
- data/lib/html2rss/html_extractor/enclosure_extractor.rb +120 -0
- data/lib/html2rss/html_extractor/image_extractor.rb +58 -0
- data/lib/html2rss/html_extractor.rb +141 -0
- data/lib/html2rss/html_navigator.rb +54 -0
- data/lib/html2rss/json_feed_builder/item.rb +94 -0
- data/lib/html2rss/json_feed_builder.rb +59 -0
- data/lib/html2rss/rendering/audio_renderer.rb +36 -0
- data/lib/html2rss/rendering/description_builder.rb +87 -0
- data/lib/html2rss/rendering/image_renderer.rb +41 -0
- data/lib/html2rss/rendering/media_renderer.rb +37 -0
- data/lib/html2rss/rendering/pdf_renderer.rb +34 -0
- data/lib/html2rss/rendering/video_renderer.rb +36 -0
- data/lib/html2rss/rendering.rb +23 -0
- data/lib/html2rss/request_controls.rb +123 -0
- data/lib/html2rss/request_service/botasaurus_contract.rb +161 -0
- data/lib/html2rss/request_service/botasaurus_strategy.rb +98 -0
- data/lib/html2rss/request_service/browserless_strategy.rb +103 -7
- data/lib/html2rss/request_service/budget.rb +39 -0
- data/lib/html2rss/request_service/context.rb +77 -21
- data/lib/html2rss/request_service/faraday_strategy.rb +137 -5
- data/lib/html2rss/request_service/policy.rb +252 -0
- data/lib/html2rss/request_service/puppet_commander.rb +212 -13
- data/lib/html2rss/request_service/response.rb +51 -3
- data/lib/html2rss/request_service/response_guard.rb +62 -0
- data/lib/html2rss/request_service.rb +50 -15
- data/lib/html2rss/request_session/rel_next_pager.rb +70 -0
- data/lib/html2rss/request_session/runtime_input.rb +71 -0
- data/lib/html2rss/request_session/runtime_policy.rb +83 -0
- data/lib/html2rss/request_session.rb +122 -0
- data/lib/html2rss/rss_builder/article.rb +187 -0
- data/lib/html2rss/rss_builder/channel.rb +105 -11
- data/lib/html2rss/rss_builder/enclosure.rb +62 -0
- data/lib/html2rss/rss_builder/stylesheet.rb +8 -4
- data/lib/html2rss/rss_builder.rb +76 -71
- data/lib/html2rss/selectors/config.rb +123 -0
- data/lib/html2rss/selectors/extractors/attribute.rb +52 -0
- data/lib/html2rss/selectors/extractors/href.rb +55 -0
- data/lib/html2rss/selectors/extractors/html.rb +49 -0
- data/lib/html2rss/selectors/extractors/static.rb +42 -0
- data/lib/html2rss/selectors/extractors/text.rb +47 -0
- data/lib/html2rss/selectors/extractors.rb +53 -0
- data/lib/html2rss/selectors/object_to_xml_converter.rb +62 -0
- data/lib/html2rss/selectors/post_processors/base.rb +80 -0
- data/lib/html2rss/selectors/post_processors/gsub.rb +88 -0
- data/lib/html2rss/selectors/post_processors/html_to_markdown.rb +48 -0
- data/lib/html2rss/selectors/post_processors/html_transformers/transform_urls_to_absolute_ones.rb +44 -0
- data/lib/html2rss/selectors/post_processors/html_transformers/wrap_img_in_a.rb +53 -0
- data/lib/html2rss/selectors/post_processors/markdown_to_html.rb +55 -0
- data/lib/html2rss/selectors/post_processors/parse_time.rb +78 -0
- data/lib/html2rss/selectors/post_processors/parse_uri.rb +43 -0
- data/lib/html2rss/selectors/post_processors/sanitize_html.rb +154 -0
- data/lib/html2rss/selectors/post_processors/substring.rb +77 -0
- data/lib/html2rss/selectors/post_processors/template.rb +76 -0
- data/lib/html2rss/selectors/post_processors.rb +48 -0
- data/lib/html2rss/selectors.rb +301 -0
- data/lib/html2rss/url.rb +266 -0
- data/lib/html2rss/version.rb +2 -1
- data/lib/html2rss.rb +67 -71
- data/lib/tasks/config_schema.rake +17 -0
- data/schema/html2rss-config.schema.json +551 -0
- metadata +120 -38
- data/lib/html2rss/attribute_post_processors/base.rb +0 -74
- data/lib/html2rss/attribute_post_processors/gsub.rb +0 -64
- data/lib/html2rss/attribute_post_processors/html_to_markdown.rb +0 -43
- data/lib/html2rss/attribute_post_processors/html_transformers/transform_urls_to_absolute_ones.rb +0 -27
- data/lib/html2rss/attribute_post_processors/html_transformers/wrap_img_in_a.rb +0 -41
- data/lib/html2rss/attribute_post_processors/markdown_to_html.rb +0 -50
- data/lib/html2rss/attribute_post_processors/parse_time.rb +0 -46
- data/lib/html2rss/attribute_post_processors/parse_uri.rb +0 -46
- data/lib/html2rss/attribute_post_processors/sanitize_html.rb +0 -115
- data/lib/html2rss/attribute_post_processors/substring.rb +0 -72
- data/lib/html2rss/attribute_post_processors/template.rb +0 -101
- data/lib/html2rss/attribute_post_processors.rb +0 -44
- data/lib/html2rss/auto_source/article.rb +0 -127
- data/lib/html2rss/auto_source/channel.rb +0 -78
- data/lib/html2rss/auto_source/reducer.rb +0 -48
- data/lib/html2rss/auto_source/rss_builder.rb +0 -70
- data/lib/html2rss/auto_source/scraper/semantic_html/extractor.rb +0 -136
- data/lib/html2rss/auto_source/scraper/semantic_html/image.rb +0 -54
- data/lib/html2rss/config/channel.rb +0 -125
- data/lib/html2rss/config/selectors.rb +0 -103
- data/lib/html2rss/item.rb +0 -186
- data/lib/html2rss/item_extractors/attribute.rb +0 -50
- data/lib/html2rss/item_extractors/href.rb +0 -52
- data/lib/html2rss/item_extractors/html.rb +0 -46
- data/lib/html2rss/item_extractors/static.rb +0 -39
- data/lib/html2rss/item_extractors/text.rb +0 -44
- data/lib/html2rss/item_extractors.rb +0 -88
- data/lib/html2rss/object_to_xml_converter.rb +0 -56
- data/lib/html2rss/rss_builder/item.rb +0 -83
- data/lib/html2rss/utils.rb +0 -113
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'reverse_markdown'
|
|
4
|
+
|
|
5
|
+
module Html2rss
|
|
6
|
+
class Selectors
|
|
7
|
+
module PostProcessors
|
|
8
|
+
##
|
|
9
|
+
# Returns HTML code as Markdown formatted String.
|
|
10
|
+
# Before converting to markdown, the HTML is sanitized with SanitizeHtml.
|
|
11
|
+
# Imagine this HTML structure:
|
|
12
|
+
#
|
|
13
|
+
# <section>
|
|
14
|
+
# Lorem <b>ipsum</b> dolor...
|
|
15
|
+
# <iframe src="https://evil.corp/miner"></iframe>
|
|
16
|
+
# <script>alert();</script>
|
|
17
|
+
# </section>
|
|
18
|
+
#
|
|
19
|
+
# YAML usage example:
|
|
20
|
+
#
|
|
21
|
+
# selectors:
|
|
22
|
+
# description:
|
|
23
|
+
# selector: section
|
|
24
|
+
# extractor: html
|
|
25
|
+
# post_process:
|
|
26
|
+
# name: html_to_markdown
|
|
27
|
+
#
|
|
28
|
+
# Would return:
|
|
29
|
+
# 'Lorem **ipsum** dolor'
|
|
30
|
+
class HtmlToMarkdown < Base
|
|
31
|
+
# @param value [String] extracted selector value
|
|
32
|
+
# @param context [Selectors::Context] post-processor context
|
|
33
|
+
# @return [void]
|
|
34
|
+
def self.validate_args!(value, context)
|
|
35
|
+
assert_type value, String, :value, context:
|
|
36
|
+
end
|
|
37
|
+
|
|
38
|
+
##
|
|
39
|
+
# @return [String] formatted in Markdown
|
|
40
|
+
def get
|
|
41
|
+
sanitized_value = SanitizeHtml.new(value, context).get
|
|
42
|
+
|
|
43
|
+
ReverseMarkdown.convert(sanitized_value)
|
|
44
|
+
end
|
|
45
|
+
end
|
|
46
|
+
end
|
|
47
|
+
end
|
|
48
|
+
end
|
data/lib/html2rss/selectors/post_processors/html_transformers/transform_urls_to_absolute_ones.rb
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Html2rss
|
|
4
|
+
class Selectors
|
|
5
|
+
module PostProcessors
|
|
6
|
+
# HTML tree transformers used by selectors post-processing.
|
|
7
|
+
module HtmlTransformers
|
|
8
|
+
##
|
|
9
|
+
# Transformer that converts relative URLs to absolute URLs within specified HTML elements.
|
|
10
|
+
class TransformUrlsToAbsoluteOnes
|
|
11
|
+
# HTML tags and the URL-bearing attribute that should be normalized.
|
|
12
|
+
URL_ELEMENTS_WITH_URL_ATTRIBUTE = {
|
|
13
|
+
'a' => :href, # Visible link
|
|
14
|
+
'img' => :src, # Visible image
|
|
15
|
+
'iframe' => :src, # Embedded frame (visible content)
|
|
16
|
+
'audio' => :src, # Can show controls, so potentially visible
|
|
17
|
+
'video' => :src # Video player is visible
|
|
18
|
+
}.freeze
|
|
19
|
+
|
|
20
|
+
# @param channel_url [String, Html2rss::Url] base URL used to resolve relative links
|
|
21
|
+
def initialize(channel_url)
|
|
22
|
+
@channel_url = channel_url
|
|
23
|
+
end
|
|
24
|
+
|
|
25
|
+
##
# Rewrites the URL-bearing attribute of a supported element to an absolute URL.
#
# Elements not listed in URL_ELEMENTS_WITH_URL_ATTRIBUTE are ignored. Supported
# elements that lack the attribute (for example an <a> without `href` or a
# <video> without `src`) are left untouched instead of handing nil to
# Url.from_relative and writing its result back as a new attribute.
#
# @param node_name [String] node name currently being transformed
# @param node [Nokogiri::XML::Node] node currently being transformed
# @param _env [Hash] transformer context
# @option _env [Object] :_reserved reserved for transformer pipeline context
# @return [void]
def call(node_name:, node:, **_env)
  url_attribute = URL_ELEMENTS_WITH_URL_ATTRIBUTE[node_name]
  return unless url_attribute

  url = node[url_attribute]
  return if url.nil?

  node[url_attribute] = Url.from_relative(url, @channel_url).to_s
end
|
|
40
|
+
end
|
|
41
|
+
end
|
|
42
|
+
end
|
|
43
|
+
end
|
|
44
|
+
end
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Html2rss
|
|
4
|
+
class Selectors
|
|
5
|
+
module PostProcessors
|
|
6
|
+
module HtmlTransformers
|
|
7
|
+
##
|
|
8
|
+
# Transformer that wraps <img> tags into <a> tags linking to `img.src`.
|
|
9
|
+
class WrapImgInA
|
|
10
|
+
##
|
|
11
|
+
# Wraps <img> tags into <a> tags that link to `img.src`.
|
|
12
|
+
#
|
|
13
|
+
# @param node_name [String]
|
|
14
|
+
# @param node [Nokogiri::XML::Node]
|
|
15
|
+
# @param _env [Hash] transformer context
|
|
16
|
+
# @option _env [Object] :_reserved reserved for transformer pipeline context
|
|
17
|
+
# @return [nil]
|
|
18
|
+
def call(node_name:, node:, **_env)
|
|
19
|
+
return unless should_process?(node_name)
|
|
20
|
+
|
|
21
|
+
wrap_image_in_anchor(node) unless already_wrapped?(node)
|
|
22
|
+
end
|
|
23
|
+
|
|
24
|
+
# @param node_name [String] node name currently being transformed
|
|
25
|
+
# @return [Boolean] whether this transformer should run for the node
|
|
26
|
+
def should_process?(node_name)
|
|
27
|
+
node_name == 'img'
|
|
28
|
+
end
|
|
29
|
+
|
|
30
|
+
# @param node [Nokogiri::XML::Node] node currently being transformed
|
|
31
|
+
# @return [Boolean] whether the image is already wrapped in a link
|
|
32
|
+
def already_wrapped?(node)
|
|
33
|
+
node.parent.name == 'a'
|
|
34
|
+
end
|
|
35
|
+
|
|
36
|
+
private
|
|
37
|
+
|
|
38
|
+
##
|
|
39
|
+
# Wraps the <img> node in an <a> tag.
|
|
40
|
+
#
|
|
41
|
+
# @param node [Nokogiri::XML::Node]
|
|
42
|
+
# @return [nil]
|
|
43
|
+
def wrap_image_in_anchor(node)
|
|
44
|
+
anchor = Nokogiri::XML::Node.new('a', node.document)
|
|
45
|
+
anchor['href'] = node['src']
|
|
46
|
+
node.add_next_sibling(anchor)
|
|
47
|
+
anchor.add_child(node.remove)
|
|
48
|
+
end
|
|
49
|
+
end
|
|
50
|
+
end
|
|
51
|
+
end
|
|
52
|
+
end
|
|
53
|
+
end
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'kramdown'
|
|
4
|
+
require_relative 'sanitize_html'
|
|
5
|
+
|
|
6
|
+
module Html2rss
|
|
7
|
+
class Selectors
|
|
8
|
+
module PostProcessors
|
|
9
|
+
##
|
|
10
|
+
# Generates HTML from Markdown.
|
|
11
|
+
#
|
|
12
|
+
# It's particularly useful in conjunction with the Template post processor
|
|
13
|
+
# to generate a description from other selectors.
|
|
14
|
+
#
|
|
15
|
+
# YAML usage example:
|
|
16
|
+
#
|
|
17
|
+
# selectors:
|
|
18
|
+
# description:
|
|
19
|
+
# selector: section
|
|
20
|
+
# post_process:
|
|
21
|
+
# - name: template
|
|
22
|
+
# string: |
|
|
23
|
+
# # %s
|
|
24
|
+
#
|
|
25
|
+
# Price: %s
|
|
26
|
+
# methods:
|
|
27
|
+
# - self
|
|
28
|
+
# - price
|
|
29
|
+
# - name: markdown_to_html
|
|
30
|
+
#
|
|
31
|
+
# Would e.g. return:
|
|
32
|
+
#
|
|
33
|
+
# <h1>Section</h1>
|
|
34
|
+
#
|
|
35
|
+
# <p>Price: 12.34</p>
|
|
36
|
+
class MarkdownToHtml < Base
|
|
37
|
+
# @param value [String] extracted selector value
|
|
38
|
+
# @param context [Selectors::Context] post-processor context
|
|
39
|
+
# @return [void]
|
|
40
|
+
def self.validate_args!(value, context)
|
|
41
|
+
assert_type value, String, :value, context:
|
|
42
|
+
end
|
|
43
|
+
|
|
44
|
+
##
|
|
45
|
+
# Converts Markdown to sanitized HTML.
|
|
46
|
+
#
|
|
47
|
+
# @return [String] Sanitized HTML content
|
|
48
|
+
def get
|
|
49
|
+
html_content = Kramdown::Document.new(value).to_html
|
|
50
|
+
SanitizeHtml.new(html_content, context).get
|
|
51
|
+
end
|
|
52
|
+
end
|
|
53
|
+
end
|
|
54
|
+
end
|
|
55
|
+
end
|
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'time'
|
|
4
|
+
require 'tzinfo'
|
|
5
|
+
|
|
6
|
+
module Html2rss
|
|
7
|
+
class Selectors
|
|
8
|
+
module PostProcessors
|
|
9
|
+
##
|
|
10
|
+
# Returns the {https://www.w3.org/Protocols/rfc822/ RFC822} representation of a time.
|
|
11
|
+
#
|
|
12
|
+
# Imagine this HTML structure:
|
|
13
|
+
#
|
|
14
|
+
# <p>Published on <span>2019-07-02</span></p>
|
|
15
|
+
#
|
|
16
|
+
# YAML usage example:
|
|
17
|
+
#
|
|
18
|
+
# selectors:
|
|
19
|
+
# description:
|
|
20
|
+
# selector: span
|
|
21
|
+
# post_process:
|
|
22
|
+
# name: 'parse_time'
|
|
23
|
+
# time_zone: 'Europe/Berlin'
|
|
24
|
+
#
|
|
25
|
+
# Would return:
|
|
26
|
+
# "Tue, 02 Jul 2019 00:00:00 +0200"
|
|
27
|
+
#
|
|
28
|
+
# It uses `Time.parse`. The time zone is read from the channel configuration (`channel.time_zone`).
|
|
29
|
+
class ParseTime < Base
|
|
30
|
+
# Validates the extracted value and the channel's configured time zone.
#
# The time zone's type is asserted before `empty?` is called on it, so a
# non-String setting (e.g. an Integer) fails the type assertion instead of
# raising NoMethodError.
#
# @param value [String] extracted selector value
# @param context [Selectors::Context] post-processor context
# @return [void]
# @raise [ArgumentError] if the configured time zone is nil or empty
def self.validate_args!(value, context)
  assert_type(value, String, :value, context:)
  time_zone_value = time_zone(context)

  # nil is reported as "missing" below; anything else must be a String.
  assert_type(time_zone_value, String, :time_zone, context:) unless time_zone_value.nil?
  return unless time_zone_value.nil? || time_zone_value.empty?

  raise ArgumentError, 'time_zone cannot be nil or empty', [], cause: nil
end
|
|
43
|
+
|
|
44
|
+
# @param context [Selectors::Context] post-processor context
|
|
45
|
+
# @return [String, nil] configured channel time zone
|
|
46
|
+
def self.time_zone(context) = context.dig(:config, :channel, :time_zone)
|
|
47
|
+
|
|
48
|
+
##
|
|
49
|
+
# Converts the provided time string to RFC822 format, taking into account the time_zone.
|
|
50
|
+
#
|
|
51
|
+
# @return [String] RFC822 formatted time
|
|
52
|
+
# @raise [TZInfo::InvalidTimezoneIdentifier] if the configured time zone is invalid
|
|
53
|
+
def get
|
|
54
|
+
with_timezone(time_zone) { Time.parse(value).rfc822 }
|
|
55
|
+
end
|
|
56
|
+
|
|
57
|
+
private
|
|
58
|
+
|
|
59
|
+
def time_zone
|
|
60
|
+
self.class.time_zone(context)
|
|
61
|
+
end
|
|
62
|
+
|
|
63
|
+
# Runs the block with the `TZ` environment variable temporarily set to
# +time_zone+ and restores the previous state afterwards.
#
# If `TZ` was not set beforehand it is removed again (assigning nil to an ENV
# key deletes it) rather than being left set to a zone abbreviation, so the
# process ends up in the same state it started in.
#
# NOTE(review): ENV is process-global; concurrent callers would see each
# other's override — confirm callers are single-threaded.
#
# @param time_zone [String, nil] time zone identifier, e.g. 'Europe/Berlin'
# @yield evaluated while `TZ` is overridden (or directly when no zone is given)
# @return [Object] the block's return value
def with_timezone(time_zone)
  return yield if time_zone.nil? || time_zone.empty?

  # Validate timezone using TZInfo before touching any global state.
  TZInfo::Timezone.get(time_zone)

  prev_tz = ENV.fetch('TZ', nil)
  tz_overridden = true
  ENV['TZ'] = time_zone
  yield
ensure
  # Only undo what was actually changed; prev_tz == nil unsets the variable.
  ENV['TZ'] = prev_tz if tz_overridden
end
|
|
75
|
+
end
|
|
76
|
+
end
|
|
77
|
+
end
|
|
78
|
+
end
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Html2rss
|
|
4
|
+
class Selectors
|
|
5
|
+
module PostProcessors
|
|
6
|
+
##
|
|
7
|
+
# Returns the normalized URL as a String.
|
|
8
|
+
# If the URL is relative, it resolves it against the channel URL.
|
|
9
|
+
#
|
|
10
|
+
# Imagine this HTML structure:
|
|
11
|
+
#
|
|
12
|
+
# <span>http://why-not-use-a-link.uh </span>
|
|
13
|
+
#
|
|
14
|
+
# YAML usage example:
|
|
15
|
+
#
|
|
16
|
+
# selectors:
|
|
17
|
+
# link:
|
|
18
|
+
# selector: span
|
|
19
|
+
# extractor: text
|
|
20
|
+
# post_process:
|
|
21
|
+
# name: parse_uri
|
|
22
|
+
#
|
|
23
|
+
# Would return:
|
|
24
|
+
# 'http://why-not-use-a-link.uh'
|
|
25
|
+
class ParseUri < Base
|
|
26
|
+
# @param value [String] extracted selector value
|
|
27
|
+
# @param _context [Selectors::Context] post-processor context
|
|
28
|
+
# @return [void]
|
|
29
|
+
def self.validate_args!(value, _context)
|
|
30
|
+
raise ArgumentError, 'The `value` option is missing or empty.' if value.to_s.empty?
|
|
31
|
+
end
|
|
32
|
+
|
|
33
|
+
##
|
|
34
|
+
# @return [String]
|
|
35
|
+
def get
|
|
36
|
+
config_url = context.dig(:config, :channel, :url)
|
|
37
|
+
|
|
38
|
+
Url.from_relative(value, config_url).to_s
|
|
39
|
+
end
|
|
40
|
+
end
|
|
41
|
+
end
|
|
42
|
+
end
|
|
43
|
+
end
|
|
@@ -0,0 +1,154 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'sanitize'
|
|
4
|
+
require_relative 'html_transformers/transform_urls_to_absolute_ones'
|
|
5
|
+
require_relative 'html_transformers/wrap_img_in_a'
|
|
6
|
+
|
|
7
|
+
module Html2rss
|
|
8
|
+
class Selectors
|
|
9
|
+
module PostProcessors
|
|
10
|
+
##
|
|
11
|
+
# Returns sanitized HTML code as String.
|
|
12
|
+
#
|
|
13
|
+
# It sanitizes by using the [sanitize gem](https://github.com/rgrove/sanitize) with
|
|
14
|
+
# [Sanitize::Config::RELAXED](https://github.com/rgrove/sanitize#sanitizeconfigrelaxed).
|
|
15
|
+
#
|
|
16
|
+
# Furthermore, it adds:
|
|
17
|
+
#
|
|
18
|
+
# - `rel="nofollow noopener noreferrer"` to <a> tags
|
|
19
|
+
# - `referrerpolicy='no-referrer'` to <img> tags
|
|
20
|
+
# - wraps all <img> tags, whose direct parent is not an <a>, into an <a>
|
|
21
|
+
# linking to the <img>'s `src`.
|
|
22
|
+
#
|
|
23
|
+
# Imagine this HTML structure:
|
|
24
|
+
#
|
|
25
|
+
# <section>
|
|
26
|
+
# Lorem <b>ipsum</b> dolor...
|
|
27
|
+
# <iframe src="https://evil.corp/miner"></iframe>
|
|
28
|
+
# <script>alert();</script>
|
|
29
|
+
# </section>
|
|
30
|
+
#
|
|
31
|
+
# YAML usage example:
|
|
32
|
+
#
|
|
33
|
+
# selectors:
|
|
34
|
+
# description:
|
|
35
|
+
# selector: '.section'
|
|
36
|
+
# extractor: html
|
|
37
|
+
# post_process:
|
|
38
|
+
# name: sanitize_html
|
|
39
|
+
#
|
|
40
|
+
# Would return:
|
|
41
|
+
# '<p>Lorem <b>ipsum</b> dolor ...</p>'
|
|
42
|
+
class SanitizeHtml < Base
|
|
43
|
+
# @see https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Referrer-Policy
|
|
44
|
+
TAG_ATTRIBUTES = {
|
|
45
|
+
'a' => {
|
|
46
|
+
'rel' => 'nofollow noopener noreferrer',
|
|
47
|
+
'target' => '_blank'
|
|
48
|
+
},
|
|
49
|
+
|
|
50
|
+
'area' => {
|
|
51
|
+
'rel' => 'nofollow noopener noreferrer',
|
|
52
|
+
'target' => '_blank'
|
|
53
|
+
},
|
|
54
|
+
|
|
55
|
+
'img' => {
|
|
56
|
+
'referrerpolicy' => 'no-referrer',
|
|
57
|
+
'crossorigin' => 'anonymous',
|
|
58
|
+
'loading' => 'lazy',
|
|
59
|
+
'decoding' => 'async'
|
|
60
|
+
},
|
|
61
|
+
|
|
62
|
+
'iframe' => {
|
|
63
|
+
'referrerpolicy' => 'no-referrer',
|
|
64
|
+
'crossorigin' => 'anonymous',
|
|
65
|
+
'loading' => 'lazy',
|
|
66
|
+
'sandbox' => 'allow-same-origin',
|
|
67
|
+
'src' => true,
|
|
68
|
+
'width' => true,
|
|
69
|
+
'height' => true
|
|
70
|
+
},
|
|
71
|
+
|
|
72
|
+
'video' => {
|
|
73
|
+
'referrerpolicy' => 'no-referrer',
|
|
74
|
+
'crossorigin' => 'anonymous',
|
|
75
|
+
'preload' => 'none',
|
|
76
|
+
'playsinline' => 'true',
|
|
77
|
+
'controls' => 'true'
|
|
78
|
+
},
|
|
79
|
+
|
|
80
|
+
'audio' => {
|
|
81
|
+
'referrerpolicy' => 'no-referrer',
|
|
82
|
+
'crossorigin' => 'anonymous',
|
|
83
|
+
'preload' => 'none'
|
|
84
|
+
}
|
|
85
|
+
}.freeze
|
|
86
|
+
# @param value [String] extracted selector value
|
|
87
|
+
# @param context [Selectors::Context] post-processor context
|
|
88
|
+
# @return [void]
|
|
89
|
+
def self.validate_args!(value, context)
|
|
90
|
+
assert_type value, String, :value, context:
|
|
91
|
+
end
|
|
92
|
+
|
|
93
|
+
##
|
|
94
|
+
# Shorthand method to get the sanitized HTML.
|
|
95
|
+
# @param html [String]
|
|
96
|
+
# @param url [String, Html2rss::Url]
|
|
97
|
+
# @return [String, nil]
|
|
98
|
+
def self.get(html, url)
|
|
99
|
+
return nil if String(html).empty?
|
|
100
|
+
|
|
101
|
+
context = Selectors::Context.new(config: { channel: { url: } }, options: {})
|
|
102
|
+
new(html, context).get
|
|
103
|
+
end
|
|
104
|
+
|
|
105
|
+
##
|
|
106
|
+
# @return [String, nil]
|
|
107
|
+
def get
|
|
108
|
+
sanitized_html = Sanitize.fragment(value, sanitize_config).to_s
|
|
109
|
+
sanitized_html.gsub!(/\s+/, ' ')
|
|
110
|
+
sanitized_html.strip!
|
|
111
|
+
sanitized_html.empty? ? nil : sanitized_html
|
|
112
|
+
end
|
|
113
|
+
|
|
114
|
+
private
|
|
115
|
+
|
|
116
|
+
def channel_url = context.dig(:config, :channel, :url)
|
|
117
|
+
|
|
118
|
+
##
|
|
119
|
+
# @return [Sanitize::Config]
|
|
120
|
+
def sanitize_config # rubocop:disable Metrics/MethodLength
|
|
121
|
+
config = Sanitize::Config.merge(
|
|
122
|
+
Sanitize::Config::RELAXED,
|
|
123
|
+
attributes: { all: %w[dir lang alt title translate] },
|
|
124
|
+
add_attributes: TAG_ATTRIBUTES,
|
|
125
|
+
transformers: [
|
|
126
|
+
method(:transform_urls_to_absolute_ones),
|
|
127
|
+
method(:wrap_img_in_a)
|
|
128
|
+
]
|
|
129
|
+
)
|
|
130
|
+
config[:elements].push('audio', 'video', 'source')
|
|
131
|
+
config
|
|
132
|
+
end
|
|
133
|
+
|
|
134
|
+
##
|
|
135
|
+
# Wrapper for transform_urls_to_absolute_ones to pass the channel_url.
|
|
136
|
+
#
|
|
137
|
+
# @param env [Hash]
|
|
138
|
+
# @return [nil]
|
|
139
|
+
def transform_urls_to_absolute_ones(env)
|
|
140
|
+
HtmlTransformers::TransformUrlsToAbsoluteOnes.new(channel_url).call(**env)
|
|
141
|
+
end
|
|
142
|
+
|
|
143
|
+
##
|
|
144
|
+
# Wrapper for wrap_img_in_a.
|
|
145
|
+
#
|
|
146
|
+
# @param env [Hash]
|
|
147
|
+
# @return [nil]
|
|
148
|
+
def wrap_img_in_a(env)
|
|
149
|
+
HtmlTransformers::WrapImgInA.new.call(**env)
|
|
150
|
+
end
|
|
151
|
+
end
|
|
152
|
+
end
|
|
153
|
+
end
|
|
154
|
+
end
|
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Html2rss
|
|
4
|
+
class Selectors
|
|
5
|
+
module PostProcessors
|
|
6
|
+
##
|
|
7
|
+
# Returns a defined part of a String.
|
|
8
|
+
#
|
|
9
|
+
# Both parameters must be an Integer and they can be negative.
|
|
10
|
+
# The +end+ parameter can be omitted, in that case it will not cut the
|
|
11
|
+
# String at the end.
|
|
12
|
+
#
|
|
13
|
+
# A Regexp or a MatchString is not supported.
|
|
14
|
+
#
|
|
15
|
+
# See the [`String#[]`](https://ruby-doc.org/core/String.html#method-i-5B-5D)
|
|
16
|
+
# documentation for more information.
|
|
17
|
+
#
|
|
18
|
+
# Imagine this HTML:
|
|
19
|
+
# <h1>Foo bar and baz</h1>
|
|
20
|
+
#
|
|
21
|
+
# YAML usage example:
|
|
22
|
+
# selectors:
|
|
23
|
+
# title:
|
|
24
|
+
# selector: h1
|
|
25
|
+
# post_process:
|
|
26
|
+
# name: substring
|
|
27
|
+
# start: 4
|
|
28
|
+
# end: 6
|
|
29
|
+
#
|
|
30
|
+
# Would return:
|
|
31
|
+
# 'bar'
|
|
32
|
+
class Substring < Base
|
|
33
|
+
# @param value [String] extracted selector value
|
|
34
|
+
# @param context [Selectors::Context] post-processor context
|
|
35
|
+
# @return [void]
|
|
36
|
+
def self.validate_args!(value, context)
|
|
37
|
+
assert_type value, String, :value, context:
|
|
38
|
+
|
|
39
|
+
options = context[:options]
|
|
40
|
+
assert_type options[:start], Integer, :start, context:
|
|
41
|
+
|
|
42
|
+
end_index = options[:end]
|
|
43
|
+
assert_type(end_index, Integer, :end, context:) if end_index
|
|
44
|
+
end
|
|
45
|
+
|
|
46
|
+
##
|
|
47
|
+
# Extracts the substring from the original string based on the provided start and end indices.
|
|
48
|
+
#
|
|
49
|
+
# @return [String] The extracted substring.
|
|
50
|
+
def get
|
|
51
|
+
value[range]
|
|
52
|
+
end
|
|
53
|
+
|
|
54
|
+
##
|
|
55
|
+
# Determines the range for the substring extraction based on the provided start and end indices.
|
|
56
|
+
#
|
|
57
|
+
# @return [Range] The range object representing the start and end/Infinity (integers).
|
|
58
|
+
def range
|
|
59
|
+
return (start_index..) unless end_index?
|
|
60
|
+
|
|
61
|
+
if start_index == end_index
|
|
62
|
+
raise ArgumentError,
|
|
63
|
+
'The `start` value must be unequal to the `end` value.'
|
|
64
|
+
end
|
|
65
|
+
|
|
66
|
+
(start_index..end_index)
|
|
67
|
+
end
|
|
68
|
+
|
|
69
|
+
private
|
|
70
|
+
|
|
71
|
+
def end_index? = !context[:options][:end].to_s.empty?
|
|
72
|
+
def end_index = context[:options][:end].to_i
|
|
73
|
+
def start_index = context[:options][:start].to_i
|
|
74
|
+
end
|
|
75
|
+
end
|
|
76
|
+
end
|
|
77
|
+
end
|
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Html2rss
|
|
4
|
+
class Selectors
|
|
5
|
+
module PostProcessors
|
|
6
|
+
##
|
|
7
|
+
# Returns a formatted String according to the string pattern.
|
|
8
|
+
# It uses [Kernel#format](https://ruby-doc.org/core/Kernel.html#method-i-format)
|
|
9
|
+
#
|
|
10
|
+
# It supports the format pattern `%<key>s` and `%{key}`, where `key` is the key of the selector.
|
|
11
|
+
# If `%{self}` is used, the selectors extracted value will be used.
|
|
12
|
+
#
|
|
13
|
+
# Imagine this HTML:
|
|
14
|
+
#
|
|
15
|
+
# <li>
|
|
16
|
+
# <h1>Product</h1>
|
|
17
|
+
# <span class="price">23,42€</span>
|
|
18
|
+
# </li>
|
|
19
|
+
#
|
|
20
|
+
#
|
|
21
|
+
# YAML usage example:
|
|
22
|
+
#
|
|
23
|
+
# selectors:
|
|
24
|
+
# items:
|
|
25
|
+
# selector: 'li'
|
|
26
|
+
# price:
|
|
27
|
+
# selector: '.price'
|
|
28
|
+
# title:
|
|
29
|
+
# selector: h1
|
|
30
|
+
# post_process:
|
|
31
|
+
# name: template
|
|
32
|
+
# string: '%{self} (%{price})'
|
|
33
|
+
#
|
|
34
|
+
# Would return:
|
|
35
|
+
# 'Product (23,42€)'
|
|
36
|
+
class Template < Base
|
|
37
|
+
# @param value [String] extracted selector value
|
|
38
|
+
# @param context [Selectors::Context] post-processor context
|
|
39
|
+
# @return [void]
|
|
40
|
+
def self.validate_args!(value, context)
|
|
41
|
+
assert_type value, String, :value, context:
|
|
42
|
+
|
|
43
|
+
string = context[:options]&.dig(:string).to_s
|
|
44
|
+
raise InvalidType, 'The `string` template is absent.' if string.empty?
|
|
45
|
+
end
|
|
46
|
+
|
|
47
|
+
##
|
|
48
|
+
# @param value [String]
|
|
49
|
+
# @param context [Selectors::Context]
|
|
50
|
+
def initialize(value, context)
|
|
51
|
+
super
|
|
52
|
+
|
|
53
|
+
@options = context[:options] || {}
|
|
54
|
+
@scraper = context[:scraper]
|
|
55
|
+
@item = context[:item]
|
|
56
|
+
@string = @options[:string].to_s
|
|
57
|
+
end
|
|
58
|
+
|
|
59
|
+
##
|
|
60
|
+
# @return [String]
|
|
61
|
+
def get
|
|
62
|
+
Html2rss::Config::DynamicParams.call(@string, {}, getter: method(:item_value), replace_missing_with: '')
|
|
63
|
+
end
|
|
64
|
+
|
|
65
|
+
private
|
|
66
|
+
|
|
67
|
+
# @param key [String, Symbol]
|
|
68
|
+
# @return [String]
|
|
69
|
+
def item_value(key)
|
|
70
|
+
key = key.to_sym
|
|
71
|
+
key == :self ? value : @scraper.select(key, @item)
|
|
72
|
+
end
|
|
73
|
+
end
|
|
74
|
+
end
|
|
75
|
+
end
|
|
76
|
+
end
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Html2rss
|
|
4
|
+
class Selectors
|
|
5
|
+
##
|
|
6
|
+
# Provides a namespace for attribute post processors.
|
|
7
|
+
module PostProcessors
|
|
8
|
+
##
|
|
9
|
+
# Error raised when an unknown post processor name is requested.
|
|
10
|
+
class UnknownPostProcessorName < Html2rss::Error; end
|
|
11
|
+
|
|
12
|
+
##
|
|
13
|
+
# Error raised when a required option is missing.
|
|
14
|
+
class MissingOption < Html2rss::Error; end
|
|
15
|
+
|
|
16
|
+
##
|
|
17
|
+
# Error raised when an invalid type is provided.
|
|
18
|
+
class InvalidType < Html2rss::Error; end
|
|
19
|
+
|
|
20
|
+
##
|
|
21
|
+
# Maps the post processor name to the class implementing the post processor.
|
|
22
|
+
#
|
|
23
|
+
# The key is the name to use in the feed config.
|
|
24
|
+
NAME_TO_CLASS = {
|
|
25
|
+
gsub: Gsub,
|
|
26
|
+
html_to_markdown: HtmlToMarkdown,
|
|
27
|
+
markdown_to_html: MarkdownToHtml,
|
|
28
|
+
parse_time: ParseTime,
|
|
29
|
+
parse_uri: ParseUri,
|
|
30
|
+
sanitize_html: SanitizeHtml,
|
|
31
|
+
substring: Substring,
|
|
32
|
+
template: Template
|
|
33
|
+
}.freeze
|
|
34
|
+
|
|
35
|
+
##
# Shorthand method to instantiate the post processor and call `#get` on it.
#
# The name is normalised via `to_s` first, so a missing (nil) name is reported
# as an unknown post processor instead of raising NoMethodError.
#
# @param name [String, Symbol] post-processor name from selector config
# @param value [Object] extracted selector value
# @param context [Selectors::Context] post-processor context
# @return [Object] transformed selector value
# @raise [UnknownPostProcessorName] if NAME_TO_CLASS has no entry for +name+
def self.get(name, value, context)
  klass = NAME_TO_CLASS.fetch(name.to_s.to_sym) do
    raise UnknownPostProcessorName, "Unknown name '#{name}'"
  end

  klass.new(value, context).get
end
|
|
46
|
+
end
|
|
47
|
+
end
|
|
48
|
+
end
|