html2rss 0.16.0 → 0.18.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (122)
  1. checksums.yaml +4 -4
  2. data/README.md +48 -657
  3. data/exe/html2rss +1 -1
  4. data/html2rss.gemspec +7 -4
  5. data/lib/html2rss/articles/deduplicator.rb +49 -0
  6. data/lib/html2rss/auto_source/cleanup.rb +33 -5
  7. data/lib/html2rss/auto_source/scraper/html.rb +118 -43
  8. data/lib/html2rss/auto_source/scraper/json_state.rb +377 -0
  9. data/lib/html2rss/auto_source/scraper/microdata.rb +399 -0
  10. data/lib/html2rss/auto_source/scraper/schema/category_extractor.rb +102 -0
  11. data/lib/html2rss/auto_source/scraper/schema/item_list.rb +2 -2
  12. data/lib/html2rss/auto_source/scraper/schema/list_item.rb +3 -3
  13. data/lib/html2rss/auto_source/scraper/schema/thing.rb +48 -8
  14. data/lib/html2rss/auto_source/scraper/schema.rb +12 -9
  15. data/lib/html2rss/auto_source/scraper/semantic_html/anchor_selector.rb +199 -0
  16. data/lib/html2rss/auto_source/scraper/semantic_html.rb +84 -78
  17. data/lib/html2rss/auto_source/scraper/wordpress_api/page_scope.rb +261 -0
  18. data/lib/html2rss/auto_source/scraper/wordpress_api/posts_endpoint.rb +134 -0
  19. data/lib/html2rss/auto_source/scraper/wordpress_api.rb +179 -0
  20. data/lib/html2rss/auto_source/scraper.rb +142 -8
  21. data/lib/html2rss/auto_source.rb +119 -47
  22. data/lib/html2rss/blocked_surface.rb +64 -0
  23. data/lib/html2rss/category_extractor.rb +82 -0
  24. data/lib/html2rss/cli.rb +170 -23
  25. data/lib/html2rss/config/class_methods.rb +189 -0
  26. data/lib/html2rss/config/dynamic_params.rb +68 -0
  27. data/lib/html2rss/config/multiple_feeds_config.rb +50 -0
  28. data/lib/html2rss/config/request_headers.rb +130 -0
  29. data/lib/html2rss/config/schema.rb +208 -0
  30. data/lib/html2rss/config/validator.rb +108 -0
  31. data/lib/html2rss/config.rb +112 -61
  32. data/lib/html2rss/error.rb +6 -0
  33. data/lib/html2rss/html_extractor/date_extractor.rb +19 -0
  34. data/lib/html2rss/html_extractor/enclosure_extractor.rb +101 -0
  35. data/lib/html2rss/html_extractor/image_extractor.rb +49 -0
  36. data/lib/html2rss/html_extractor.rb +136 -0
  37. data/lib/html2rss/html_navigator.rb +46 -0
  38. data/lib/html2rss/json_feed_builder/item.rb +94 -0
  39. data/lib/html2rss/json_feed_builder.rb +58 -0
  40. data/lib/html2rss/rendering/audio_renderer.rb +31 -0
  41. data/lib/html2rss/rendering/description_builder.rb +88 -0
  42. data/lib/html2rss/rendering/image_renderer.rb +31 -0
  43. data/lib/html2rss/rendering/media_renderer.rb +33 -0
  44. data/lib/html2rss/rendering/pdf_renderer.rb +28 -0
  45. data/lib/html2rss/rendering/video_renderer.rb +31 -0
  46. data/lib/html2rss/rendering.rb +14 -0
  47. data/lib/html2rss/request_controls.rb +128 -0
  48. data/lib/html2rss/request_service/browserless_strategy.rb +103 -7
  49. data/lib/html2rss/request_service/budget.rb +39 -0
  50. data/lib/html2rss/request_service/context.rb +64 -20
  51. data/lib/html2rss/request_service/faraday_strategy.rb +135 -5
  52. data/lib/html2rss/request_service/policy.rb +248 -0
  53. data/lib/html2rss/request_service/puppet_commander.rb +212 -13
  54. data/lib/html2rss/request_service/response.rb +42 -2
  55. data/lib/html2rss/request_service/response_guard.rb +62 -0
  56. data/lib/html2rss/request_service.rb +31 -15
  57. data/lib/html2rss/request_session/rel_next_pager.rb +70 -0
  58. data/lib/html2rss/request_session/runtime_input.rb +57 -0
  59. data/lib/html2rss/request_session/runtime_policy.rb +76 -0
  60. data/lib/html2rss/request_session.rb +118 -0
  61. data/lib/html2rss/rss_builder/article.rb +166 -0
  62. data/lib/html2rss/rss_builder/channel.rb +96 -11
  63. data/lib/html2rss/rss_builder/enclosure.rb +48 -0
  64. data/lib/html2rss/rss_builder/stylesheet.rb +4 -4
  65. data/lib/html2rss/rss_builder.rb +72 -71
  66. data/lib/html2rss/selectors/config.rb +122 -0
  67. data/lib/html2rss/selectors/extractors/attribute.rb +50 -0
  68. data/lib/html2rss/selectors/extractors/href.rb +53 -0
  69. data/lib/html2rss/selectors/extractors/html.rb +48 -0
  70. data/lib/html2rss/selectors/extractors/static.rb +41 -0
  71. data/lib/html2rss/selectors/extractors/text.rb +46 -0
  72. data/lib/html2rss/selectors/extractors.rb +52 -0
  73. data/lib/html2rss/selectors/object_to_xml_converter.rb +61 -0
  74. data/lib/html2rss/selectors/post_processors/base.rb +74 -0
  75. data/lib/html2rss/selectors/post_processors/gsub.rb +85 -0
  76. data/lib/html2rss/selectors/post_processors/html_to_markdown.rb +45 -0
  77. data/lib/html2rss/selectors/post_processors/html_transformers/transform_urls_to_absolute_ones.rb +35 -0
  78. data/lib/html2rss/selectors/post_processors/html_transformers/wrap_img_in_a.rb +47 -0
  79. data/lib/html2rss/selectors/post_processors/markdown_to_html.rb +52 -0
  80. data/lib/html2rss/selectors/post_processors/parse_time.rb +73 -0
  81. data/lib/html2rss/selectors/post_processors/parse_uri.rb +40 -0
  82. data/lib/html2rss/selectors/post_processors/sanitize_html.rb +150 -0
  83. data/lib/html2rss/selectors/post_processors/substring.rb +74 -0
  84. data/lib/html2rss/selectors/post_processors/template.rb +73 -0
  85. data/lib/html2rss/selectors/post_processors.rb +43 -0
  86. data/lib/html2rss/selectors.rb +294 -0
  87. data/lib/html2rss/url.rb +262 -0
  88. data/lib/html2rss/version.rb +1 -1
  89. data/lib/html2rss.rb +129 -70
  90. data/lib/tasks/config_schema.rake +17 -0
  91. data/schema/html2rss-config.schema.json +469 -0
  92. metadata +120 -46
  93. data/lib/html2rss/attribute_post_processors/base.rb +0 -74
  94. data/lib/html2rss/attribute_post_processors/gsub.rb +0 -64
  95. data/lib/html2rss/attribute_post_processors/html_to_markdown.rb +0 -43
  96. data/lib/html2rss/attribute_post_processors/html_transformers/transform_urls_to_absolute_ones.rb +0 -27
  97. data/lib/html2rss/attribute_post_processors/html_transformers/wrap_img_in_a.rb +0 -41
  98. data/lib/html2rss/attribute_post_processors/markdown_to_html.rb +0 -50
  99. data/lib/html2rss/attribute_post_processors/parse_time.rb +0 -46
  100. data/lib/html2rss/attribute_post_processors/parse_uri.rb +0 -46
  101. data/lib/html2rss/attribute_post_processors/sanitize_html.rb +0 -108
  102. data/lib/html2rss/attribute_post_processors/substring.rb +0 -72
  103. data/lib/html2rss/attribute_post_processors/template.rb +0 -101
  104. data/lib/html2rss/attribute_post_processors.rb +0 -44
  105. data/lib/html2rss/auto_source/article.rb +0 -127
  106. data/lib/html2rss/auto_source/channel.rb +0 -78
  107. data/lib/html2rss/auto_source/reducer.rb +0 -48
  108. data/lib/html2rss/auto_source/rss_builder.rb +0 -70
  109. data/lib/html2rss/auto_source/scraper/semantic_html/extractor.rb +0 -136
  110. data/lib/html2rss/auto_source/scraper/semantic_html/image.rb +0 -54
  111. data/lib/html2rss/config/channel.rb +0 -125
  112. data/lib/html2rss/config/selectors.rb +0 -103
  113. data/lib/html2rss/item.rb +0 -186
  114. data/lib/html2rss/item_extractors/attribute.rb +0 -50
  115. data/lib/html2rss/item_extractors/href.rb +0 -52
  116. data/lib/html2rss/item_extractors/html.rb +0 -46
  117. data/lib/html2rss/item_extractors/static.rb +0 -39
  118. data/lib/html2rss/item_extractors/text.rb +0 -44
  119. data/lib/html2rss/item_extractors.rb +0 -88
  120. data/lib/html2rss/object_to_xml_converter.rb +0 -56
  121. data/lib/html2rss/rss_builder/item.rb +0 -83
  122. data/lib/html2rss/utils.rb +0 -113
@@ -1,108 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require 'sanitize'
4
- require_relative 'html_transformers/transform_urls_to_absolute_ones'
5
- require_relative 'html_transformers/wrap_img_in_a'
6
-
7
- module Html2rss
8
- module AttributePostProcessors
9
- ##
10
- # Returns sanitized HTML code as String.
11
- #
12
- # It sanitizes by using the [sanitize gem](https://github.com/rgrove/sanitize) with
13
- # [Sanitize::Config::RELAXED](https://github.com/rgrove/sanitize#sanitizeconfigrelaxed).
14
- #
15
- # Furthermore, it adds:
16
- #
17
- # - `rel="nofollow noopener noreferrer"` to <a> tags
18
- # - `referrer-policy='no-referrer'` to <img> tags
19
- # - wraps all <img> tags, whose direct parent is not an <a>, into an <a>
20
- # linking to the <img>'s `src`.
21
- #
22
- # Imagine this HTML structure:
23
- #
24
- # <section>
25
- # Lorem <b>ipsum</b> dolor...
26
- # <iframe src="https://evil.corp/miner"></iframe>
27
- # <script>alert();</script>
28
- # </section>
29
- #
30
- # YAML usage example:
31
- #
32
- # selectors:
33
- # description:
34
- # selector: '.section'
35
- # extractor: html
36
- # post_process:
37
- # name: sanitize_html
38
- #
39
- # Would return:
40
- # '<p>Lorem <b>ipsum</b> dolor ...</p>'
41
- class SanitizeHtml < Base
42
- def self.validate_args!(value, context)
43
- assert_type value, String, :value, context:
44
- end
45
-
46
- ##
47
- # Shorthand method to get the sanitized HTML.
48
- # @param html [String]
49
- # @param url [String, Addressable::URI]
50
- def self.get(html, url)
51
- raise ArgumentError, 'url must be a String or Addressable::URI' if url.to_s.empty?
52
- return nil if html.to_s.empty?
53
-
54
- new(html, { config: Config::Channel.new({ url: }) }).get
55
- end
56
-
57
- ##
58
- # @return [String]
59
- def get
60
- sanitized_html = Sanitize.fragment(value, sanitize_config)
61
- sanitized_html.to_s.gsub(/\s+/, ' ').strip
62
- end
63
-
64
- private
65
-
66
- ##
67
- # @return [Sanitize::Config]
68
- def sanitize_config
69
- Sanitize::Config.merge(
70
- Sanitize::Config::RELAXED,
71
- attributes: { all: %w[dir lang alt title translate] },
72
- add_attributes:,
73
- transformers: [
74
- method(:transform_urls_to_absolute_ones),
75
- method(:wrap_img_in_a)
76
- ]
77
- )
78
- end
79
-
80
- def add_attributes
81
- {
82
- 'a' => { 'rel' => 'nofollow noopener noreferrer', 'target' => '_blank' },
83
- 'img' => { 'referrer-policy' => 'no-referrer' }
84
- }
85
- end
86
-
87
- def channel_url = context[:config].url
88
-
89
- ##
90
- # Wrapper for transform_urls_to_absolute_ones to pass the channel_url.
91
- #
92
- # @param env [Hash]
93
- # @return [nil]
94
- def transform_urls_to_absolute_ones(env)
95
- HtmlTransformers::TransformUrlsToAbsoluteOnes.new(channel_url).call(**env)
96
- end
97
-
98
- ##
99
- # Wrapper for wrap_img_in_a.
100
- #
101
- # @param env [Hash]
102
- # @return [nil]
103
- def wrap_img_in_a(env)
104
- HtmlTransformers::WrapImgInA.new.call(**env)
105
- end
106
- end
107
- end
108
- end
@@ -1,72 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- module Html2rss
4
- module AttributePostProcessors
5
- ##
6
- # Returns a defined part of a String.
7
- #
8
- # Both parameters must be an Integer and they can be negative.
9
- # The +end+ parameter can be omitted, in that case it will not cut the
10
- # String at the end.
11
- #
12
- # A Regexp or a MatchString is not supported.
13
- #
14
- # See the [`String#[]`](https://ruby-doc.org/core/String.html#method-i-5B-5D)
15
- # documentation for more information.
16
- #
17
- # Imagine this HTML:
18
- # <h1>Foo bar and baz<h1>
19
- #
20
- # YAML usage example:
21
- # selectors:
22
- # title:
23
- # selector: h1
24
- # post_process:
25
- # name: substring
26
- # start: 4
27
- # end: 6
28
- #
29
- # Would return:
30
- # 'bar'
31
- class Substring < Base
32
- def self.validate_args!(value, context)
33
- assert_type value, String, :value, context:
34
-
35
- options = context[:options]
36
- assert_type options[:start], Integer, :start, context:
37
-
38
- end_index = options[:end]
39
- assert_type(end_index, Integer, :end, context:) if end_index
40
- end
41
-
42
- ##
43
- # Extracts the substring from the original string based on the provided start and end indices.
44
- #
45
- # @return [String] The extracted substring.
46
- def get
47
- value[range]
48
- end
49
-
50
- ##
51
- # Determines the range for the substring extraction based on the provided start and end indices.
52
- #
53
- # @return [Range] The range object representing the start and end/Infinity (integers).
54
- def range
55
- return (start_index..) unless end_index?
56
-
57
- if start_index == end_index
58
- raise ArgumentError,
59
- 'The `start` value must be unequal to the `end` value.'
60
- end
61
-
62
- (start_index..end_index)
63
- end
64
-
65
- private
66
-
67
- def end_index? = !context[:options][:end].to_s.empty?
68
- def end_index = context[:options][:end].to_i
69
- def start_index = context[:options][:start].to_i
70
- end
71
- end
72
- end
@@ -1,101 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- module Html2rss
4
- module AttributePostProcessors
5
- ##
6
- # Returns a formatted String according to the string pattern.
7
- #
8
- # If +self+ is used, the selectors extracted value will be used.
9
- # It uses [Kernel#format](https://ruby-doc.org/core/Kernel.html#method-i-format)
10
- #
11
- # Imagine this HTML:
12
- #
13
- # <li>
14
- # <h1>Product</h1>
15
- # <span class="price">23,42€</span>
16
- # </li>
17
- #
18
- #
19
- # YAML usage example:
20
- #
21
- # selectors:
22
- # items:
23
- # selector: 'li'
24
- # price:
25
- # selector: '.price'
26
- # title:
27
- # selector: h1
28
- # post_process:
29
- # name: template
30
- # string: '%{self} (%{price})'
31
- #
32
- # Would return:
33
- # 'Product (23,42€)'
34
- class Template < Base
35
- def self.validate_args!(value, context)
36
- assert_type value, String, :value, context:
37
-
38
- string = context[:options]&.dig(:string).to_s
39
- raise InvalidType, 'The `string` template is absent.' if string.empty?
40
- end
41
-
42
- ##
43
- # @param value [String]
44
- # @param context [Item::Context]
45
- def initialize(value, context)
46
- super
47
-
48
- @options = context[:options] || {}
49
- @item = context[:item]
50
- @string = @options[:string].to_s
51
- end
52
-
53
- ##
54
- # @return [String]
55
- def get
56
- @options[:methods] ? format_string_with_methods : format_string_with_dynamic_params
57
- end
58
-
59
- private
60
-
61
- ##
62
- # @return [String] the string containing the template
63
- attr_reader :string
64
-
65
- ##
66
- # @return [Array<String>]
67
- def methods
68
- @methods ||= @options[:methods].map { |method_name| item_value(method_name) }
69
- end
70
-
71
- ##
72
- # Formats a string using methods.
73
- #
74
- # @return [String]
75
- # @deprecated Use %<id>s formatting instead. Will be removed in version 1.0.0. See README / Dynamic parameters.
76
- def format_string_with_methods
77
- Log.warn '[DEPRECATION] This method of using params is deprecated and \
78
- support for it will be removed in version 1.0.0.\
79
- Please use dynamic parameters (i.e. %<id>s, see README.md) instead.'
80
-
81
- string % methods
82
- end
83
-
84
- ##
85
- # @return [String]
86
- def format_string_with_dynamic_params
87
- param_names = string.scan(/%[<|{](\w*)[>|}]/)
88
- param_names.flatten!
89
-
90
- format(string, param_names.to_h { |name| [name.to_sym, item_value(name)] })
91
- end
92
-
93
- ##
94
- # @param method_name [String, Symbol]
95
- # @return [String]
96
- def item_value(method_name)
97
- method_name.to_sym == :self ? value : @item.public_send(method_name).to_s
98
- end
99
- end
100
- end
101
- end
@@ -1,44 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- module Html2rss
4
- ##
5
- # Provides a namespace for attribute post processors.
6
- module AttributePostProcessors
7
- ##
8
- # Error raised when an unknown post processor name is requested.
9
- class UnknownPostProcessorName < Html2rss::Error; end
10
-
11
- ##
12
- # Error raised when a required option is missing.
13
- class MissingOption < Html2rss::Error; end
14
-
15
- ##
16
- # Error raised when an invalid type is provided.
17
- class InvalidType < Html2rss::Error; end
18
-
19
- ##
20
- # Maps the post processor name to the class implementing the post processor.
21
- #
22
- # The key is the name to use in the feed config.
23
- NAME_TO_CLASS = {
24
- gsub: Gsub,
25
- html_to_markdown: HtmlToMarkdown,
26
- markdown_to_html: MarkdownToHtml,
27
- parse_time: ParseTime,
28
- parse_uri: ParseUri,
29
- sanitize_html: SanitizeHtml,
30
- substring: Substring,
31
- template: Template
32
- }.freeze
33
-
34
- ##
35
- # Retrieves the attribute post processor class based on the given name.
36
- #
37
- # @param name [Symbol] The name of the post processor.
38
- # @return [Class] The attribute post processor class.
39
- # @raise [UnknownPostProcessorName] If the requested name is not found in NAME_TO_CLASS.
40
- def self.get_processor(name)
41
- NAME_TO_CLASS[name.to_sym] || raise(UnknownPostProcessorName, "Can't find a post processor named '#{name}'")
42
- end
43
- end
44
- end
@@ -1,127 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require 'zlib'
4
- require 'sanitize'
5
- require 'nokogiri'
6
-
7
- module Html2rss
8
- class AutoSource
9
- ##
10
- # Article is a simple data object representing an article extracted from a page.
11
- # It is enumerable and responds to all keys specified in PROVIDED_KEYS.
12
- class Article
13
- include Enumerable
14
- include Comparable
15
-
16
- PROVIDED_KEYS = %i[id title description url image guid published_at scraper].freeze
17
-
18
- ##
19
- # Removes the specified pattern from the beginning of the text
20
- # within a given range if the pattern occurs before the range's end.
21
- #
22
- # @param text [String]
23
- # @param pattern [String]
24
- # @param end_of_range [Integer] - Optional, defaults to half the size of the text
25
- # @return [String]
26
- def self.remove_pattern_from_start(text, pattern, end_of_range: (text.size * 0.5).to_i)
27
- return text unless text.is_a?(String) && pattern.is_a?(String)
28
-
29
- index = text.index(pattern)
30
- return text if index.nil? || index >= end_of_range
31
-
32
- text.gsub(/^(.{0,#{end_of_range}})#{Regexp.escape(pattern)}/, '\1')
33
- end
34
-
35
- ##
36
- # Checks if the text contains HTML tags.
37
- # @param text [String]
38
- # @return [Boolean]
39
- def self.contains_html?(text)
40
- Nokogiri::HTML.fragment(text).children.any?(&:element?)
41
- end
42
-
43
- # @param options [Hash<Symbol, String>]
44
- def initialize(**options)
45
- @to_h = {}
46
- options.each_pair { |key, value| @to_h[key] = value.freeze if value }
47
- @to_h.freeze
48
-
49
- return unless (unknown_keys = options.keys - PROVIDED_KEYS).any?
50
-
51
- Log.warn "Article: unknown keys found: #{unknown_keys.join(', ')}"
52
- end
53
-
54
- # Checks if the article is valid based on the presence of URL, ID, and either title or description.
55
- # @return [Boolean] True if the article is valid, otherwise false.
56
- def valid?
57
- !url.to_s.empty? && (!title.to_s.empty? || !description.to_s.empty?) && !id.to_s.empty?
58
- end
59
-
60
- # @yield [key, value]
61
- # @return [Enumerator] if no block is given
62
- def each
63
- return enum_for(:each) unless block_given?
64
-
65
- PROVIDED_KEYS.each { |key| yield(key, public_send(key)) }
66
- end
67
-
68
- def id
69
- @to_h[:id]
70
- end
71
-
72
- def title
73
- @to_h[:title]
74
- end
75
-
76
- def description
77
- return @description if defined?(@description)
78
-
79
- return if (description = @to_h[:description]).to_s.empty?
80
-
81
- @description = self.class.remove_pattern_from_start(description, title) if title
82
-
83
- if self.class.contains_html?(@description) && url
84
- @description = Html2rss::AttributePostProcessors::SanitizeHtml.get(description, url)
85
- else
86
- @description
87
- end
88
- end
89
-
90
- # @return [Addressable::URI, nil]
91
- def url
92
- @url ||= Html2rss::Utils.sanitize_url(@to_h[:url])
93
- end
94
-
95
- # @return [Addressable::URI, nil]
96
- def image
97
- @image ||= Html2rss::Utils.sanitize_url(@to_h[:image])
98
- end
99
-
100
- # Generates a unique identifier based on the URL and ID using CRC32.
101
- # @return [String]
102
- def guid
103
- @guid ||= Zlib.crc32([url, id].join('#!/')).to_s(36).encode('utf-8')
104
- end
105
-
106
- # Parses and returns the published_at time.
107
- # @return [DateTime, nil]
108
- def published_at
109
- return if (string = @to_h[:published_at].to_s.strip).empty?
110
-
111
- @published_at ||= DateTime.parse(string)
112
- rescue ArgumentError
113
- nil
114
- end
115
-
116
- def scraper
117
- @to_h[:scraper]
118
- end
119
-
120
- def <=>(other)
121
- return nil unless other.is_a?(Article)
122
-
123
- 0 if other.all? { |key, value| value == public_send(key) ? public_send(key) <=> value : false }
124
- end
125
- end
126
- end
127
- end
@@ -1,78 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- module Html2rss
4
- class AutoSource
5
- ##
6
- # Extracts channel information from
7
- # 1. the HTML document's <head>.
8
- # 2. the HTTP response
9
- class Channel
10
- ##
11
- #
12
- # @param parsed_body [Nokogiri::HTML::Document] The parsed HTML document.
13
- # @param url [Addressable::URI] The URL of the channel.
14
- # @param headers [Hash<String, String>] the http headers
15
- # @param articles [Array<Html2rss::AutoSource::Article>] The articles.
16
- def initialize(parsed_body, url:, headers:, articles: [], stylesheets: [])
17
- @parsed_body = parsed_body
18
- @url = url
19
- @headers = headers
20
- @articles = articles
21
- @stylesheets = stylesheets
22
- end
23
-
24
- attr_writer :articles
25
- attr_reader :stylesheets
26
-
27
- def url = @url.normalize.to_s
28
-
29
- def title
30
- @title ||= if (title = parsed_body.at_css('head > title')&.text.to_s) && !title.empty?
31
- title.gsub(/\s+/, ' ').strip
32
- else
33
- Utils.titleized_channel_url(@url)
34
- end
35
- end
36
-
37
- def description = parsed_body.at_css('meta[name="description"]')&.[]('content')
38
- def last_build_date = headers['last-modified']
39
-
40
- def language
41
- return parsed_body['lang'] if parsed_body.name == 'html' && parsed_body['lang']
42
-
43
- parsed_body.at_css('[lang]')&.[]('lang')
44
- end
45
-
46
- def image
47
- url = parsed_body.at_css('meta[property="og:image"]')&.[]('content')
48
- Html2rss::Utils.sanitize_url(url) if url
49
- end
50
-
51
- def ttl
52
- ttl = headers['cache-control']&.match(/max-age=(\d+)/)&.[](1)
53
- return unless ttl
54
-
55
- ttl.to_i.fdiv(60).ceil
56
- end
57
-
58
- def generator
59
- "html2rss V. #{::Html2rss::VERSION} (using auto_source scrapers: #{scraper_counts})"
60
- end
61
-
62
- private
63
-
64
- attr_reader :parsed_body, :headers
65
-
66
- def scraper_counts
67
- scraper_counts = +''
68
-
69
- @articles.each_with_object(Hash.new(0)) { |article, counts| counts[article.scraper] += 1 }
70
- .each do |klass, count|
71
- scraper_counts.concat("[#{klass.to_s.gsub('Html2rss::AutoSource::Scraper::', '')}=#{count}]")
72
- end
73
-
74
- scraper_counts
75
- end
76
- end
77
- end
78
- end
@@ -1,48 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- module Html2rss
4
- class AutoSource
5
- ##
6
- # Reducer is responsible for reducing the list of articles.
7
- # It keeps only the longest attributes of articles with the same URL.
8
- # It also filters out invalid articles.
9
- class Reducer
10
- class << self
11
- def call(articles, **_options)
12
- Log.debug "Reducer: inited with #{articles.size} articles"
13
-
14
- reduce_by_keeping_longest_values(articles, keep: [:scraper]) { |article| article.url&.path }
15
- end
16
-
17
- private
18
-
19
- # @param articles [Array<Article>]
20
- # @return [Array<Article>] reduced articles
21
- def reduce_by_keeping_longest_values(articles, keep:, &)
22
- grouped_by_block = articles.group_by(&)
23
- grouped_by_block.each_with_object([]) do |(_key, grouped_articles), result|
24
- memo_object = {}
25
- grouped_articles.each do |article_hash|
26
- keep_longest_values(memo_object, article_hash, keep:)
27
- end
28
-
29
- result << Article.new(**memo_object)
30
- end
31
- end
32
-
33
- def keep_longest_values(memo_object, article_hash, keep:)
34
- article_hash.each do |key, value|
35
- next if value.eql?(memo_object[key])
36
-
37
- if keep.include?(key)
38
- memo_object[key] ||= []
39
- memo_object[key] << value
40
- elsif value && value.to_s.size > memo_object[key].to_s.size
41
- memo_object[key] = value
42
- end
43
- end
44
- end
45
- end
46
- end
47
- end
48
- end
@@ -1,70 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require 'rss'
4
-
5
- module Html2rss
6
- class AutoSource
7
- ##
8
- # Converts the autosourced channel and articles to an RSS feed.
9
- class RssBuilder
10
- def self.add_guid(article, maker)
11
- maker.guid.tap do |guid|
12
- guid.content = article.guid
13
- guid.isPermaLink = false
14
- end
15
- end
16
-
17
- def self.add_image(article, maker)
18
- url = article.image || return
19
-
20
- maker.enclosure.tap do |enclosure|
21
- enclosure.url = url
22
- enclosure.type = Html2rss::Utils.guess_content_type_from_url(url)
23
- enclosure.length = 0
24
- end
25
- end
26
-
27
- def initialize(channel:, articles:)
28
- @channel = channel
29
- @articles = articles
30
- end
31
-
32
- def call
33
- RSS::Maker.make('2.0') do |maker|
34
- Html2rss::RssBuilder::Stylesheet.add(maker, channel.stylesheets)
35
-
36
- make_channel(maker.channel)
37
- make_items(maker)
38
- end
39
- end
40
-
41
- private
42
-
43
- attr_reader :channel, :articles
44
-
45
- def make_channel(maker)
46
- %i[language title description ttl].each do |key|
47
- maker.public_send(:"#{key}=", channel.public_send(key))
48
- end
49
-
50
- maker.link = channel.url
51
- maker.generator = channel.generator
52
- maker.updated = channel.last_build_date
53
- end
54
-
55
- def make_items(maker)
56
- articles.each do |article|
57
- maker.items.new_item do |item_maker|
58
- RssBuilder.add_guid(article, item_maker)
59
- RssBuilder.add_image(article, item_maker)
60
-
61
- item_maker.title = article.title
62
- item_maker.description = article.description
63
- item_maker.pubDate = article.published_at&.rfc2822
64
- item_maker.link = article.url
65
- end
66
- end
67
- end
68
- end
69
- end
70
- end