html2rss 0.17.0 → 0.19.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (127) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +90 -639
  3. data/exe/html2rss +1 -1
  4. data/html2rss.gemspec +5 -2
  5. data/lib/html2rss/articles/deduplicator.rb +50 -0
  6. data/lib/html2rss/auto_source/cleanup.rb +44 -5
  7. data/lib/html2rss/auto_source/scraper/html.rb +123 -43
  8. data/lib/html2rss/auto_source/scraper/json_state.rb +457 -0
  9. data/lib/html2rss/auto_source/scraper/microdata.rb +505 -0
  10. data/lib/html2rss/auto_source/scraper/schema/category_extractor.rb +102 -0
  11. data/lib/html2rss/auto_source/scraper/schema/item_list.rb +3 -2
  12. data/lib/html2rss/auto_source/scraper/schema/list_item.rb +6 -4
  13. data/lib/html2rss/auto_source/scraper/schema/thing.rb +69 -8
  14. data/lib/html2rss/auto_source/scraper/schema.rb +27 -12
  15. data/lib/html2rss/auto_source/scraper/semantic_html/anchor_selector.rb +204 -0
  16. data/lib/html2rss/auto_source/scraper/semantic_html.rb +88 -79
  17. data/lib/html2rss/auto_source/scraper/wordpress_api/page_scope.rb +311 -0
  18. data/lib/html2rss/auto_source/scraper/wordpress_api/posts_endpoint.rb +135 -0
  19. data/lib/html2rss/auto_source/scraper/wordpress_api.rb +186 -0
  20. data/lib/html2rss/auto_source/scraper.rb +160 -8
  21. data/lib/html2rss/auto_source.rb +123 -47
  22. data/lib/html2rss/blocked_surface.rb +65 -0
  23. data/lib/html2rss/category_extractor.rb +82 -0
  24. data/lib/html2rss/cli.rb +194 -23
  25. data/lib/html2rss/config/class_methods.rb +178 -0
  26. data/lib/html2rss/config/dynamic_params.rb +70 -0
  27. data/lib/html2rss/config/multiple_feeds_config.rb +51 -0
  28. data/lib/html2rss/config/request_headers.rb +136 -0
  29. data/lib/html2rss/config/schema.rb +240 -0
  30. data/lib/html2rss/config/validator.rb +146 -0
  31. data/lib/html2rss/config.rb +118 -61
  32. data/lib/html2rss/error.rb +31 -0
  33. data/lib/html2rss/feed_pipeline/auto_fallback.rb +127 -0
  34. data/lib/html2rss/feed_pipeline.rb +127 -0
  35. data/lib/html2rss/hash_util.rb +101 -0
  36. data/lib/html2rss/html_extractor/date_extractor.rb +20 -0
  37. data/lib/html2rss/html_extractor/enclosure_extractor.rb +120 -0
  38. data/lib/html2rss/html_extractor/image_extractor.rb +58 -0
  39. data/lib/html2rss/html_extractor.rb +141 -0
  40. data/lib/html2rss/html_navigator.rb +54 -0
  41. data/lib/html2rss/json_feed_builder/item.rb +94 -0
  42. data/lib/html2rss/json_feed_builder.rb +59 -0
  43. data/lib/html2rss/rendering/audio_renderer.rb +36 -0
  44. data/lib/html2rss/rendering/description_builder.rb +87 -0
  45. data/lib/html2rss/rendering/image_renderer.rb +41 -0
  46. data/lib/html2rss/rendering/media_renderer.rb +37 -0
  47. data/lib/html2rss/rendering/pdf_renderer.rb +34 -0
  48. data/lib/html2rss/rendering/video_renderer.rb +36 -0
  49. data/lib/html2rss/rendering.rb +23 -0
  50. data/lib/html2rss/request_controls.rb +123 -0
  51. data/lib/html2rss/request_service/botasaurus_contract.rb +161 -0
  52. data/lib/html2rss/request_service/botasaurus_strategy.rb +98 -0
  53. data/lib/html2rss/request_service/browserless_strategy.rb +103 -7
  54. data/lib/html2rss/request_service/budget.rb +39 -0
  55. data/lib/html2rss/request_service/context.rb +77 -21
  56. data/lib/html2rss/request_service/faraday_strategy.rb +137 -5
  57. data/lib/html2rss/request_service/policy.rb +252 -0
  58. data/lib/html2rss/request_service/puppet_commander.rb +212 -13
  59. data/lib/html2rss/request_service/response.rb +51 -3
  60. data/lib/html2rss/request_service/response_guard.rb +62 -0
  61. data/lib/html2rss/request_service.rb +50 -15
  62. data/lib/html2rss/request_session/rel_next_pager.rb +70 -0
  63. data/lib/html2rss/request_session/runtime_input.rb +71 -0
  64. data/lib/html2rss/request_session/runtime_policy.rb +83 -0
  65. data/lib/html2rss/request_session.rb +122 -0
  66. data/lib/html2rss/rss_builder/article.rb +187 -0
  67. data/lib/html2rss/rss_builder/channel.rb +105 -11
  68. data/lib/html2rss/rss_builder/enclosure.rb +62 -0
  69. data/lib/html2rss/rss_builder/stylesheet.rb +8 -4
  70. data/lib/html2rss/rss_builder.rb +76 -71
  71. data/lib/html2rss/selectors/config.rb +123 -0
  72. data/lib/html2rss/selectors/extractors/attribute.rb +52 -0
  73. data/lib/html2rss/selectors/extractors/href.rb +55 -0
  74. data/lib/html2rss/selectors/extractors/html.rb +49 -0
  75. data/lib/html2rss/selectors/extractors/static.rb +42 -0
  76. data/lib/html2rss/selectors/extractors/text.rb +47 -0
  77. data/lib/html2rss/selectors/extractors.rb +53 -0
  78. data/lib/html2rss/selectors/object_to_xml_converter.rb +62 -0
  79. data/lib/html2rss/selectors/post_processors/base.rb +80 -0
  80. data/lib/html2rss/selectors/post_processors/gsub.rb +88 -0
  81. data/lib/html2rss/selectors/post_processors/html_to_markdown.rb +48 -0
  82. data/lib/html2rss/selectors/post_processors/html_transformers/transform_urls_to_absolute_ones.rb +44 -0
  83. data/lib/html2rss/selectors/post_processors/html_transformers/wrap_img_in_a.rb +53 -0
  84. data/lib/html2rss/selectors/post_processors/markdown_to_html.rb +55 -0
  85. data/lib/html2rss/selectors/post_processors/parse_time.rb +78 -0
  86. data/lib/html2rss/selectors/post_processors/parse_uri.rb +43 -0
  87. data/lib/html2rss/selectors/post_processors/sanitize_html.rb +154 -0
  88. data/lib/html2rss/selectors/post_processors/substring.rb +77 -0
  89. data/lib/html2rss/selectors/post_processors/template.rb +76 -0
  90. data/lib/html2rss/selectors/post_processors.rb +48 -0
  91. data/lib/html2rss/selectors.rb +301 -0
  92. data/lib/html2rss/url.rb +266 -0
  93. data/lib/html2rss/version.rb +2 -1
  94. data/lib/html2rss.rb +67 -71
  95. data/lib/tasks/config_schema.rake +17 -0
  96. data/schema/html2rss-config.schema.json +551 -0
  97. metadata +120 -38
  98. data/lib/html2rss/attribute_post_processors/base.rb +0 -74
  99. data/lib/html2rss/attribute_post_processors/gsub.rb +0 -64
  100. data/lib/html2rss/attribute_post_processors/html_to_markdown.rb +0 -43
  101. data/lib/html2rss/attribute_post_processors/html_transformers/transform_urls_to_absolute_ones.rb +0 -27
  102. data/lib/html2rss/attribute_post_processors/html_transformers/wrap_img_in_a.rb +0 -41
  103. data/lib/html2rss/attribute_post_processors/markdown_to_html.rb +0 -50
  104. data/lib/html2rss/attribute_post_processors/parse_time.rb +0 -46
  105. data/lib/html2rss/attribute_post_processors/parse_uri.rb +0 -46
  106. data/lib/html2rss/attribute_post_processors/sanitize_html.rb +0 -115
  107. data/lib/html2rss/attribute_post_processors/substring.rb +0 -72
  108. data/lib/html2rss/attribute_post_processors/template.rb +0 -101
  109. data/lib/html2rss/attribute_post_processors.rb +0 -44
  110. data/lib/html2rss/auto_source/article.rb +0 -127
  111. data/lib/html2rss/auto_source/channel.rb +0 -78
  112. data/lib/html2rss/auto_source/reducer.rb +0 -48
  113. data/lib/html2rss/auto_source/rss_builder.rb +0 -70
  114. data/lib/html2rss/auto_source/scraper/semantic_html/extractor.rb +0 -136
  115. data/lib/html2rss/auto_source/scraper/semantic_html/image.rb +0 -54
  116. data/lib/html2rss/config/channel.rb +0 -125
  117. data/lib/html2rss/config/selectors.rb +0 -103
  118. data/lib/html2rss/item.rb +0 -186
  119. data/lib/html2rss/item_extractors/attribute.rb +0 -50
  120. data/lib/html2rss/item_extractors/href.rb +0 -52
  121. data/lib/html2rss/item_extractors/html.rb +0 -46
  122. data/lib/html2rss/item_extractors/static.rb +0 -39
  123. data/lib/html2rss/item_extractors/text.rb +0 -44
  124. data/lib/html2rss/item_extractors.rb +0 -88
  125. data/lib/html2rss/object_to_xml_converter.rb +0 -56
  126. data/lib/html2rss/rss_builder/item.rb +0 -83
  127. data/lib/html2rss/utils.rb +0 -113
@@ -0,0 +1,48 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'reverse_markdown'
4
+
5
+ module Html2rss
6
+ class Selectors
7
+ module PostProcessors
8
+ ##
9
+ # Returns HTML code as Markdown formatted String.
10
+ # Before converting to markdown, the HTML is sanitized with SanitizeHtml.
11
+ # Imagine this HTML structure:
12
+ #
13
+ # <section>
14
+ # Lorem <b>ipsum</b> dolor...
15
+ # <iframe src="https://evil.corp/miner"></iframe>
16
+ # <script>alert();</script>
17
+ # </section>
18
+ #
19
+ # YAML usage example:
20
+ #
21
+ # selectors:
22
+ # description:
23
+ # selector: section
24
+ # extractor: html
25
+ # post_process:
26
+ # name: html_to_markdown
27
+ #
28
+ # Would return:
29
+ # 'Lorem **ipsum** dolor'
30
+ class HtmlToMarkdown < Base
31
+ # @param value [String] extracted selector value
32
+ # @param context [Selectors::Context] post-processor context
33
+ # @return [void]
34
+ def self.validate_args!(value, context)
35
+ assert_type value, String, :value, context:
36
+ end
37
+
38
+ ##
39
+ # @return [String] formatted in Markdown
40
+ def get
41
+ sanitized_value = SanitizeHtml.new(value, context).get
42
+
43
+ ReverseMarkdown.convert(sanitized_value)
44
+ end
45
+ end
46
+ end
47
+ end
48
+ end
@@ -0,0 +1,44 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Html2rss
4
+ class Selectors
5
+ module PostProcessors
6
+ # HTML tree transformers used by selectors post-processing.
7
+ module HtmlTransformers
8
+ ##
9
+ # Transformer that converts relative URLs to absolute URLs within specified HTML elements.
10
+ class TransformUrlsToAbsoluteOnes
11
+ # HTML tags and the URL-bearing attribute that should be normalized.
12
+ URL_ELEMENTS_WITH_URL_ATTRIBUTE = {
13
+ 'a' => :href, # Visible link
14
+ 'img' => :src, # Visible image
15
+ 'iframe' => :src, # Embedded frame (visible content)
16
+ 'audio' => :src, # Can show controls, so potentially visible
17
+ 'video' => :src # Video player is visible
18
+ }.freeze
19
+
20
+ # @param channel_url [String, Html2rss::Url] base URL used to resolve relative links
21
+ def initialize(channel_url)
22
+ @channel_url = channel_url
23
+ end
24
+
25
+ ##
26
+ # Transforms URLs to absolute ones.
27
+ #
28
+ # @param node_name [String] node name currently being transformed
29
+ # @param node [Nokogiri::XML::Node] node currently being transformed
30
+ # @param _env [Hash] transformer context
31
+ # @option _env [Object] :_reserved reserved for transformer pipeline context
32
+ # @return [void]
33
+ def call(node_name:, node:, **_env)
34
+ return unless URL_ELEMENTS_WITH_URL_ATTRIBUTE.key?(node_name)
35
+
36
+ url_attribute = URL_ELEMENTS_WITH_URL_ATTRIBUTE[node_name]
37
+ url = node[url_attribute]
38
+ node[url_attribute] = Url.from_relative(url, @channel_url).to_s
39
+ end
40
+ end
41
+ end
42
+ end
43
+ end
44
+ end
@@ -0,0 +1,53 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Html2rss
4
+ class Selectors
5
+ module PostProcessors
6
+ module HtmlTransformers
7
+ ##
8
+ # Transformer that wraps <img> tags into <a> tags linking to `img.src`.
9
+ class WrapImgInA
10
+ ##
11
+ # Wraps <img> tags into <a> tags that link to `img.src`.
12
+ #
13
+ # @param node_name [String]
14
+ # @param node [Nokogiri::XML::Node]
15
+ # @param _env [Hash] transformer context
16
+ # @option _env [Object] :_reserved reserved for transformer pipeline context
17
+ # @return [nil]
18
+ def call(node_name:, node:, **_env)
19
+ return unless should_process?(node_name)
20
+
21
+ wrap_image_in_anchor(node) unless already_wrapped?(node)
22
+ end
23
+
24
+ # @param node_name [String] node name currently being transformed
25
+ # @return [Boolean] whether this transformer should run for the node
26
+ def should_process?(node_name)
27
+ node_name == 'img'
28
+ end
29
+
30
+ # @param node [Nokogiri::XML::Node] node currently being transformed
31
+ # @return [Boolean] whether the image is already wrapped in a link
32
+ def already_wrapped?(node)
33
+ node.parent.name == 'a'
34
+ end
35
+
36
+ private
37
+
38
+ ##
39
+ # Wraps the <img> node in an <a> tag.
40
+ #
41
+ # @param node [Nokogiri::XML::Node]
42
+ # @return [nil]
43
+ def wrap_image_in_anchor(node)
44
+ anchor = Nokogiri::XML::Node.new('a', node.document)
45
+ anchor['href'] = node['src']
46
+ node.add_next_sibling(anchor)
47
+ anchor.add_child(node.remove)
48
+ end
49
+ end
50
+ end
51
+ end
52
+ end
53
+ end
@@ -0,0 +1,55 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'kramdown'
4
+ require_relative 'sanitize_html'
5
+
6
+ module Html2rss
7
+ class Selectors
8
+ module PostProcessors
9
+ ##
10
+ # Generates HTML from Markdown.
11
+ #
12
+ # It's particularly useful in conjunction with the Template post processor
13
+ # to generate a description from other selectors.
14
+ #
15
+ # YAML usage example:
16
+ #
17
+ # selectors:
18
+ # description:
19
+ # selector: section
20
+ # post_process:
21
+ # - name: template
22
+ # string: |
23
+ # # %s
24
+ #
25
+ # Price: %s
26
+ # methods:
27
+ # - self
28
+ # - price
29
+ # - name: markdown_to_html
30
+ #
31
+ # Would e.g. return:
32
+ #
33
+ # <h1>Section</h1>
34
+ #
35
+ # <p>Price: 12.34</p>
36
+ class MarkdownToHtml < Base
37
+ # @param value [String] extracted selector value
38
+ # @param context [Selectors::Context] post-processor context
39
+ # @return [void]
40
+ def self.validate_args!(value, context)
41
+ assert_type value, String, :value, context:
42
+ end
43
+
44
+ ##
45
+ # Converts Markdown to sanitized HTML.
46
+ #
47
+ # @return [String] Sanitized HTML content
48
+ def get
49
+ html_content = Kramdown::Document.new(value).to_html
50
+ SanitizeHtml.new(html_content, context).get
51
+ end
52
+ end
53
+ end
54
+ end
55
+ end
@@ -0,0 +1,78 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'time'
4
+ require 'tzinfo'
5
+
6
+ module Html2rss
7
+ class Selectors
8
+ module PostProcessors
9
+ ##
10
+ # Returns the {https://www.w3.org/Protocols/rfc822/ RFC822} representation of a time.
11
+ #
12
+ # Imagine this HTML structure:
13
+ #
14
+ # <p>Published on <span>2019-07-02</span></p>
15
+ #
16
+ # YAML usage example:
17
+ #
18
+ # selectors:
19
+ # description:
20
+ # selector: span
21
+ # post_process:
22
+ # name: 'parse_time'
23
+ # time_zone: 'Europe/Berlin'
24
+ #
25
+ # Would return:
26
+ # "Tue, 02 Jul 2019 00:00:00 +0200"
27
+ #
28
+ # It uses `Time.parse`; the time zone is read from the channel config (`channel.time_zone`).
29
+ class ParseTime < Base
30
+ # @param value [String] extracted selector value
31
+ # @param context [Selectors::Context] post-processor context
32
+ # @return [void]
33
+ def self.validate_args!(value, context)
34
+ assert_type(value, String, :value, context:)
35
+ time_zone_value = time_zone(context)
36
+
37
+ if time_zone_value.nil? || time_zone_value.empty?
38
+ raise ArgumentError, 'time_zone cannot be nil or empty', [], cause: nil
39
+ end
40
+
41
+ assert_type(time_zone_value, String, :time_zone, context:)
42
+ end
43
+
44
+ # @param context [Selectors::Context] post-processor context
45
+ # @return [String, nil] configured channel time zone
46
+ def self.time_zone(context) = context.dig(:config, :channel, :time_zone)
47
+
48
+ ##
49
+ # Converts the provided time string to RFC822 format, taking into account the time_zone.
50
+ #
51
+ # @return [String] RFC822 formatted time
52
+ # @raise [TZInfo::InvalidTimezoneIdentifier] if the configured time zone is invalid
53
+ def get
54
+ with_timezone(time_zone) { Time.parse(value).rfc822 }
55
+ end
56
+
57
+ private
58
+
59
+ def time_zone
60
+ self.class.time_zone(context)
61
+ end
62
+
63
+ def with_timezone(time_zone)
64
+ return yield if time_zone.nil? || time_zone.empty?
65
+
66
+ # Validate first: an unknown identifier raises before ENV is touched.
67
+ TZInfo::Timezone.get(time_zone)
68
+ prev_tz = ENV['TZ'] # nil when TZ was unset; a zone abbreviation is not a valid TZ value
69
+ tz_switched = true
70
+ ENV['TZ'] = time_zone
71
+ yield
72
+ ensure
73
+ ENV['TZ'] = prev_tz if tz_switched # assigning nil removes TZ again
74
+ end
75
+ end
76
+ end
77
+ end
78
+ end
@@ -0,0 +1,43 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Html2rss
4
+ class Selectors
5
+ module PostProcessors
6
+ ##
7
+ # Returns the normalized URL as a String.
8
+ # If the URL is relative, it resolves it against the channel URL.
9
+ #
10
+ # Imagine this HTML structure:
11
+ #
12
+ # <span>http://why-not-use-a-link.uh </span>
13
+ #
14
+ # YAML usage example:
15
+ #
16
+ # selectors:
17
+ # link:
18
+ # selector: span
19
+ # extractor: text
20
+ # post_process:
21
+ # name: parse_uri
22
+ #
23
+ # Would return:
24
+ # 'http://why-not-use-a-link.uh'
25
+ class ParseUri < Base
26
+ # @param value [String] extracted selector value
27
+ # @param _context [Selectors::Context] post-processor context
28
+ # @return [void]
29
+ def self.validate_args!(value, _context)
30
+ raise ArgumentError, 'The `value` option is missing or empty.' if value.to_s.empty?
31
+ end
32
+
33
+ ##
34
+ # @return [String]
35
+ def get
36
+ config_url = context.dig(:config, :channel, :url)
37
+
38
+ Url.from_relative(value, config_url).to_s
39
+ end
40
+ end
41
+ end
42
+ end
43
+ end
@@ -0,0 +1,154 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'sanitize'
4
+ require_relative 'html_transformers/transform_urls_to_absolute_ones'
5
+ require_relative 'html_transformers/wrap_img_in_a'
6
+
7
+ module Html2rss
8
+ class Selectors
9
+ module PostProcessors
10
+ ##
11
+ # Returns sanitized HTML code as String.
12
+ #
13
+ # It sanitizes by using the [sanitize gem](https://github.com/rgrove/sanitize) with
14
+ # [Sanitize::Config::RELAXED](https://github.com/rgrove/sanitize#sanitizeconfigrelaxed).
15
+ #
16
+ # Furthermore, it adds:
17
+ #
18
+ # - `rel="nofollow noopener noreferrer"` to <a> tags
19
+ # - `referrerpolicy='no-referrer'` to <img> tags
20
+ # - wraps all <img> tags, whose direct parent is not an <a>, into an <a>
21
+ # linking to the <img>'s `src`.
22
+ #
23
+ # Imagine this HTML structure:
24
+ #
25
+ # <section>
26
+ # Lorem <b>ipsum</b> dolor...
27
+ # <iframe src="https://evil.corp/miner"></iframe>
28
+ # <script>alert();</script>
29
+ # </section>
30
+ #
31
+ # YAML usage example:
32
+ #
33
+ # selectors:
34
+ # description:
35
+ # selector: section
36
+ # extractor: html
37
+ # post_process:
38
+ # name: sanitize_html
39
+ #
40
+ # Would return:
41
+ # '<p>Lorem <b>ipsum</b> dolor ...</p>'
42
+ class SanitizeHtml < Base
43
+ # @see https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Referrer-Policy
44
+ TAG_ATTRIBUTES = {
45
+ 'a' => {
46
+ 'rel' => 'nofollow noopener noreferrer',
47
+ 'target' => '_blank'
48
+ },
49
+
50
+ 'area' => {
51
+ 'rel' => 'nofollow noopener noreferrer',
52
+ 'target' => '_blank'
53
+ },
54
+
55
+ 'img' => {
56
+ 'referrerpolicy' => 'no-referrer',
57
+ 'crossorigin' => 'anonymous',
58
+ 'loading' => 'lazy',
59
+ 'decoding' => 'async'
60
+ },
61
+
62
+ 'iframe' => {
63
+ 'referrerpolicy' => 'no-referrer',
64
+ 'crossorigin' => 'anonymous',
65
+ 'loading' => 'lazy',
66
+ 'sandbox' => 'allow-same-origin',
67
+ 'src' => true,
68
+ 'width' => true,
69
+ 'height' => true
70
+ },
71
+
72
+ 'video' => {
73
+ 'referrerpolicy' => 'no-referrer',
74
+ 'crossorigin' => 'anonymous',
75
+ 'preload' => 'none',
76
+ 'playsinline' => 'true',
77
+ 'controls' => 'true'
78
+ },
79
+
80
+ 'audio' => {
81
+ 'referrerpolicy' => 'no-referrer',
82
+ 'crossorigin' => 'anonymous',
83
+ 'preload' => 'none'
84
+ }
85
+ }.freeze
86
+ # @param value [String] extracted selector value
87
+ # @param context [Selectors::Context] post-processor context
88
+ # @return [void]
89
+ def self.validate_args!(value, context)
90
+ assert_type value, String, :value, context:
91
+ end
92
+
93
+ ##
94
+ # Shorthand method to get the sanitized HTML.
95
+ # @param html [String]
96
+ # @param url [String, Html2rss::Url]
97
+ # @return [String, nil]
98
+ def self.get(html, url)
99
+ return nil if String(html).empty?
100
+
101
+ context = Selectors::Context.new(config: { channel: { url: } }, options: {})
102
+ new(html, context).get
103
+ end
104
+
105
+ ##
106
+ # @return [String, nil]
107
+ def get
108
+ sanitized_html = Sanitize.fragment(value, sanitize_config).to_s
109
+ sanitized_html.gsub!(/\s+/, ' ')
110
+ sanitized_html.strip!
111
+ sanitized_html.empty? ? nil : sanitized_html
112
+ end
113
+
114
+ private
115
+
116
+ def channel_url = context.dig(:config, :channel, :url)
117
+
118
+ ##
119
+ # @return [Hash] Sanitize configuration derived from Sanitize::Config::RELAXED
120
+ def sanitize_config # rubocop:disable Metrics/MethodLength
121
+ config = Sanitize::Config.merge(
122
+ Sanitize::Config::RELAXED,
123
+ attributes: { all: %w[dir lang alt title translate] },
124
+ add_attributes: TAG_ATTRIBUTES,
125
+ transformers: [
126
+ method(:transform_urls_to_absolute_ones),
127
+ method(:wrap_img_in_a)
128
+ ]
129
+ )
130
+ config[:elements].push('audio', 'video', 'source')
131
+ config
132
+ end
133
+
134
+ ##
135
+ # Wrapper for transform_urls_to_absolute_ones to pass the channel_url.
136
+ #
137
+ # @param env [Hash]
138
+ # @return [nil]
139
+ def transform_urls_to_absolute_ones(env)
140
+ HtmlTransformers::TransformUrlsToAbsoluteOnes.new(channel_url).call(**env)
141
+ end
142
+
143
+ ##
144
+ # Wrapper for wrap_img_in_a.
145
+ #
146
+ # @param env [Hash]
147
+ # @return [nil]
148
+ def wrap_img_in_a(env)
149
+ HtmlTransformers::WrapImgInA.new.call(**env)
150
+ end
151
+ end
152
+ end
153
+ end
154
+ end
@@ -0,0 +1,77 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Html2rss
4
+ class Selectors
5
+ module PostProcessors
6
+ ##
7
+ # Returns a defined part of a String.
8
+ #
9
+ # Both parameters must be Integers; they may be negative.
10
+ # The +end+ parameter can be omitted, in that case it will not cut the
11
+ # String at the end.
12
+ #
13
+ # A Regexp or a MatchString is not supported.
14
+ #
15
+ # See the [`String#[]`](https://ruby-doc.org/core/String.html#method-i-5B-5D)
16
+ # documentation for more information.
17
+ #
18
+ # Imagine this HTML:
19
+ # <h1>Foo bar and baz</h1>
20
+ #
21
+ # YAML usage example:
22
+ # selectors:
23
+ # title:
24
+ # selector: h1
25
+ # post_process:
26
+ # name: substring
27
+ # start: 4
28
+ # end: 6
29
+ #
30
+ # Would return:
31
+ # 'bar'
32
+ class Substring < Base
33
+ # @param value [String] extracted selector value
34
+ # @param context [Selectors::Context] post-processor context
35
+ # @return [void]
36
+ def self.validate_args!(value, context)
37
+ assert_type value, String, :value, context:
38
+
39
+ options = context[:options]
40
+ assert_type options[:start], Integer, :start, context:
41
+
42
+ end_index = options[:end]
43
+ assert_type(end_index, Integer, :end, context:) if end_index
44
+ end
45
+
46
+ ##
47
+ # Extracts the substring from the original string based on the provided start and end indices.
48
+ #
49
+ # @return [String] The extracted substring.
50
+ def get
51
+ value[range]
52
+ end
53
+
54
+ ##
55
+ # Determines the range for the substring extraction based on the provided start and end indices.
56
+ #
57
+ # @return [Range] The range object representing the start and end/Infinity (integers).
58
+ def range
59
+ return (start_index..) unless end_index?
60
+
61
+ if start_index == end_index
62
+ raise ArgumentError,
63
+ 'The `start` value must be unequal to the `end` value.'
64
+ end
65
+
66
+ (start_index..end_index)
67
+ end
68
+
69
+ private
70
+
71
+ def end_index? = !context[:options][:end].to_s.empty?
72
+ def end_index = context[:options][:end].to_i
73
+ def start_index = context[:options][:start].to_i
74
+ end
75
+ end
76
+ end
77
+ end
@@ -0,0 +1,76 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Html2rss
4
+ class Selectors
5
+ module PostProcessors
6
+ ##
7
+ # Returns a formatted String according to the string pattern.
8
+ # It uses [Kernel#format](https://ruby-doc.org/core/Kernel.html#method-i-format)
9
+ #
10
+ # It supports the format pattern `%<key>s` and `%{key}`, where `key` is the key of the selector.
11
+ # If `%{self}` is used, the selector's own extracted value will be used.
12
+ #
13
+ # Imagine this HTML:
14
+ #
15
+ # <li>
16
+ # <h1>Product</h1>
17
+ # <span class="price">23,42€</span>
18
+ # </li>
19
+ #
20
+ #
21
+ # YAML usage example:
22
+ #
23
+ # selectors:
24
+ # items:
25
+ # selector: 'li'
26
+ # price:
27
+ # selector: '.price'
28
+ # title:
29
+ # selector: h1
30
+ # post_process:
31
+ # name: template
32
+ # string: '%{self} (%{price})'
33
+ #
34
+ # Would return:
35
+ # 'Product (23,42€)'
36
+ class Template < Base
37
+ # @param value [String] extracted selector value
38
+ # @param context [Selectors::Context] post-processor context
39
+ # @return [void]
40
+ def self.validate_args!(value, context)
41
+ assert_type value, String, :value, context:
42
+
43
+ string = context[:options]&.dig(:string).to_s
44
+ raise InvalidType, 'The `string` template is absent.' if string.empty?
45
+ end
46
+
47
+ ##
48
+ # @param value [String]
49
+ # @param context [Selectors::Context]
50
+ def initialize(value, context)
51
+ super
52
+
53
+ @options = context[:options] || {}
54
+ @scraper = context[:scraper]
55
+ @item = context[:item]
56
+ @string = @options[:string].to_s
57
+ end
58
+
59
+ ##
60
+ # @return [String]
61
+ def get
62
+ Html2rss::Config::DynamicParams.call(@string, {}, getter: method(:item_value), replace_missing_with: '')
63
+ end
64
+
65
+ private
66
+
67
+ # @param key [String, Symbol]
68
+ # @return [String]
69
+ def item_value(key)
70
+ key = key.to_sym
71
+ key == :self ? value : @scraper.select(key, @item)
72
+ end
73
+ end
74
+ end
75
+ end
76
+ end
@@ -0,0 +1,48 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Html2rss
4
+ class Selectors
5
+ ##
6
+ # Provides a namespace for selector post processors.
7
+ module PostProcessors
8
+ ##
9
+ # Error raised when an unknown post processor name is requested.
10
+ class UnknownPostProcessorName < Html2rss::Error; end
11
+
12
+ ##
13
+ # Error raised when a required option is missing.
14
+ class MissingOption < Html2rss::Error; end
15
+
16
+ ##
17
+ # Error raised when an invalid type is provided.
18
+ class InvalidType < Html2rss::Error; end
19
+
20
+ ##
21
+ # Maps the post processor name to the class implementing the post processor.
22
+ #
23
+ # The key is the name to use in the feed config.
24
+ NAME_TO_CLASS = {
25
+ gsub: Gsub,
26
+ html_to_markdown: HtmlToMarkdown,
27
+ markdown_to_html: MarkdownToHtml,
28
+ parse_time: ParseTime,
29
+ parse_uri: ParseUri,
30
+ sanitize_html: SanitizeHtml,
31
+ substring: Substring,
32
+ template: Template
33
+ }.freeze
34
+
35
+ ##
36
+ # Shorthand method to instantiate the post processor and call `#get` on it
37
+ #
38
+ # @param name [String, Symbol] post-processor name from selector config
39
+ # @param value [Object] extracted selector value
40
+ # @param context [Selectors::Context] post-processor context
41
+ # @return [Object] transformed selector value
42
+ def self.get(name, value, context)
43
+ klass = NAME_TO_CLASS[name.to_sym] || raise(UnknownPostProcessorName, "Unknown name '#{name}'")
44
+ klass.new(value, context).get
45
+ end
46
+ end
47
+ end
48
+ end