html2rss 0.17.0 → 0.19.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (127) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +90 -639
  3. data/exe/html2rss +1 -1
  4. data/html2rss.gemspec +5 -2
  5. data/lib/html2rss/articles/deduplicator.rb +50 -0
  6. data/lib/html2rss/auto_source/cleanup.rb +44 -5
  7. data/lib/html2rss/auto_source/scraper/html.rb +123 -43
  8. data/lib/html2rss/auto_source/scraper/json_state.rb +457 -0
  9. data/lib/html2rss/auto_source/scraper/microdata.rb +505 -0
  10. data/lib/html2rss/auto_source/scraper/schema/category_extractor.rb +102 -0
  11. data/lib/html2rss/auto_source/scraper/schema/item_list.rb +3 -2
  12. data/lib/html2rss/auto_source/scraper/schema/list_item.rb +6 -4
  13. data/lib/html2rss/auto_source/scraper/schema/thing.rb +69 -8
  14. data/lib/html2rss/auto_source/scraper/schema.rb +27 -12
  15. data/lib/html2rss/auto_source/scraper/semantic_html/anchor_selector.rb +204 -0
  16. data/lib/html2rss/auto_source/scraper/semantic_html.rb +88 -79
  17. data/lib/html2rss/auto_source/scraper/wordpress_api/page_scope.rb +311 -0
  18. data/lib/html2rss/auto_source/scraper/wordpress_api/posts_endpoint.rb +135 -0
  19. data/lib/html2rss/auto_source/scraper/wordpress_api.rb +186 -0
  20. data/lib/html2rss/auto_source/scraper.rb +160 -8
  21. data/lib/html2rss/auto_source.rb +123 -47
  22. data/lib/html2rss/blocked_surface.rb +65 -0
  23. data/lib/html2rss/category_extractor.rb +82 -0
  24. data/lib/html2rss/cli.rb +194 -23
  25. data/lib/html2rss/config/class_methods.rb +178 -0
  26. data/lib/html2rss/config/dynamic_params.rb +70 -0
  27. data/lib/html2rss/config/multiple_feeds_config.rb +51 -0
  28. data/lib/html2rss/config/request_headers.rb +136 -0
  29. data/lib/html2rss/config/schema.rb +240 -0
  30. data/lib/html2rss/config/validator.rb +146 -0
  31. data/lib/html2rss/config.rb +118 -61
  32. data/lib/html2rss/error.rb +31 -0
  33. data/lib/html2rss/feed_pipeline/auto_fallback.rb +127 -0
  34. data/lib/html2rss/feed_pipeline.rb +127 -0
  35. data/lib/html2rss/hash_util.rb +101 -0
  36. data/lib/html2rss/html_extractor/date_extractor.rb +20 -0
  37. data/lib/html2rss/html_extractor/enclosure_extractor.rb +120 -0
  38. data/lib/html2rss/html_extractor/image_extractor.rb +58 -0
  39. data/lib/html2rss/html_extractor.rb +141 -0
  40. data/lib/html2rss/html_navigator.rb +54 -0
  41. data/lib/html2rss/json_feed_builder/item.rb +94 -0
  42. data/lib/html2rss/json_feed_builder.rb +59 -0
  43. data/lib/html2rss/rendering/audio_renderer.rb +36 -0
  44. data/lib/html2rss/rendering/description_builder.rb +87 -0
  45. data/lib/html2rss/rendering/image_renderer.rb +41 -0
  46. data/lib/html2rss/rendering/media_renderer.rb +37 -0
  47. data/lib/html2rss/rendering/pdf_renderer.rb +34 -0
  48. data/lib/html2rss/rendering/video_renderer.rb +36 -0
  49. data/lib/html2rss/rendering.rb +23 -0
  50. data/lib/html2rss/request_controls.rb +123 -0
  51. data/lib/html2rss/request_service/botasaurus_contract.rb +161 -0
  52. data/lib/html2rss/request_service/botasaurus_strategy.rb +98 -0
  53. data/lib/html2rss/request_service/browserless_strategy.rb +103 -7
  54. data/lib/html2rss/request_service/budget.rb +39 -0
  55. data/lib/html2rss/request_service/context.rb +77 -21
  56. data/lib/html2rss/request_service/faraday_strategy.rb +137 -5
  57. data/lib/html2rss/request_service/policy.rb +252 -0
  58. data/lib/html2rss/request_service/puppet_commander.rb +212 -13
  59. data/lib/html2rss/request_service/response.rb +51 -3
  60. data/lib/html2rss/request_service/response_guard.rb +62 -0
  61. data/lib/html2rss/request_service.rb +50 -15
  62. data/lib/html2rss/request_session/rel_next_pager.rb +70 -0
  63. data/lib/html2rss/request_session/runtime_input.rb +71 -0
  64. data/lib/html2rss/request_session/runtime_policy.rb +83 -0
  65. data/lib/html2rss/request_session.rb +122 -0
  66. data/lib/html2rss/rss_builder/article.rb +187 -0
  67. data/lib/html2rss/rss_builder/channel.rb +105 -11
  68. data/lib/html2rss/rss_builder/enclosure.rb +62 -0
  69. data/lib/html2rss/rss_builder/stylesheet.rb +8 -4
  70. data/lib/html2rss/rss_builder.rb +76 -71
  71. data/lib/html2rss/selectors/config.rb +123 -0
  72. data/lib/html2rss/selectors/extractors/attribute.rb +52 -0
  73. data/lib/html2rss/selectors/extractors/href.rb +55 -0
  74. data/lib/html2rss/selectors/extractors/html.rb +49 -0
  75. data/lib/html2rss/selectors/extractors/static.rb +42 -0
  76. data/lib/html2rss/selectors/extractors/text.rb +47 -0
  77. data/lib/html2rss/selectors/extractors.rb +53 -0
  78. data/lib/html2rss/selectors/object_to_xml_converter.rb +62 -0
  79. data/lib/html2rss/selectors/post_processors/base.rb +80 -0
  80. data/lib/html2rss/selectors/post_processors/gsub.rb +88 -0
  81. data/lib/html2rss/selectors/post_processors/html_to_markdown.rb +48 -0
  82. data/lib/html2rss/selectors/post_processors/html_transformers/transform_urls_to_absolute_ones.rb +44 -0
  83. data/lib/html2rss/selectors/post_processors/html_transformers/wrap_img_in_a.rb +53 -0
  84. data/lib/html2rss/selectors/post_processors/markdown_to_html.rb +55 -0
  85. data/lib/html2rss/selectors/post_processors/parse_time.rb +78 -0
  86. data/lib/html2rss/selectors/post_processors/parse_uri.rb +43 -0
  87. data/lib/html2rss/selectors/post_processors/sanitize_html.rb +154 -0
  88. data/lib/html2rss/selectors/post_processors/substring.rb +77 -0
  89. data/lib/html2rss/selectors/post_processors/template.rb +76 -0
  90. data/lib/html2rss/selectors/post_processors.rb +48 -0
  91. data/lib/html2rss/selectors.rb +301 -0
  92. data/lib/html2rss/url.rb +266 -0
  93. data/lib/html2rss/version.rb +2 -1
  94. data/lib/html2rss.rb +67 -71
  95. data/lib/tasks/config_schema.rake +17 -0
  96. data/schema/html2rss-config.schema.json +551 -0
  97. metadata +120 -38
  98. data/lib/html2rss/attribute_post_processors/base.rb +0 -74
  99. data/lib/html2rss/attribute_post_processors/gsub.rb +0 -64
  100. data/lib/html2rss/attribute_post_processors/html_to_markdown.rb +0 -43
  101. data/lib/html2rss/attribute_post_processors/html_transformers/transform_urls_to_absolute_ones.rb +0 -27
  102. data/lib/html2rss/attribute_post_processors/html_transformers/wrap_img_in_a.rb +0 -41
  103. data/lib/html2rss/attribute_post_processors/markdown_to_html.rb +0 -50
  104. data/lib/html2rss/attribute_post_processors/parse_time.rb +0 -46
  105. data/lib/html2rss/attribute_post_processors/parse_uri.rb +0 -46
  106. data/lib/html2rss/attribute_post_processors/sanitize_html.rb +0 -115
  107. data/lib/html2rss/attribute_post_processors/substring.rb +0 -72
  108. data/lib/html2rss/attribute_post_processors/template.rb +0 -101
  109. data/lib/html2rss/attribute_post_processors.rb +0 -44
  110. data/lib/html2rss/auto_source/article.rb +0 -127
  111. data/lib/html2rss/auto_source/channel.rb +0 -78
  112. data/lib/html2rss/auto_source/reducer.rb +0 -48
  113. data/lib/html2rss/auto_source/rss_builder.rb +0 -70
  114. data/lib/html2rss/auto_source/scraper/semantic_html/extractor.rb +0 -136
  115. data/lib/html2rss/auto_source/scraper/semantic_html/image.rb +0 -54
  116. data/lib/html2rss/config/channel.rb +0 -125
  117. data/lib/html2rss/config/selectors.rb +0 -103
  118. data/lib/html2rss/item.rb +0 -186
  119. data/lib/html2rss/item_extractors/attribute.rb +0 -50
  120. data/lib/html2rss/item_extractors/href.rb +0 -52
  121. data/lib/html2rss/item_extractors/html.rb +0 -46
  122. data/lib/html2rss/item_extractors/static.rb +0 -39
  123. data/lib/html2rss/item_extractors/text.rb +0 -44
  124. data/lib/html2rss/item_extractors.rb +0 -88
  125. data/lib/html2rss/object_to_xml_converter.rb +0 -56
  126. data/lib/html2rss/rss_builder/item.rb +0 -83
  127. data/lib/html2rss/utils.rb +0 -113
@@ -0,0 +1,187 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'zlib'
4
+ require 'sanitize'
5
+ require 'nokogiri'
6
+
7
+ module Html2rss
8
+ class RssBuilder
9
##
# Article is a simple data object representing an article extracted from a page.
# It is enumerable (yielding `key, value` pairs for every entry of PROVIDED_KEYS)
# and responds to all keys specified in PROVIDED_KEYS.
class Article
  include Enumerable
  include Comparable

  # Allowed article attributes accepted by the value object constructor.
  PROVIDED_KEYS = %i[id title description url image author guid published_at enclosures categories scraper].freeze
  # Separator used to build deterministic deduplication fingerprints and GUID sources.
  DEDUP_FINGERPRINT_SEPARATOR = '#!/'

  # Stores every truthy option (frozen) and warns about keys outside PROVIDED_KEYS.
  #
  # @param options [Hash{Symbol => Object}]
  # @option options [String] :id stable article identifier
  # @option options [String] :title article title
  # @option options [String] :description article description/content
  # @option options [String, Html2rss::Url] :url canonical article URL
  # @option options [String, Html2rss::Url] :image image URL for fallback enclosure rendering
  # @option options [String] :author author name
  # @option options [Array<String>] :guid explicit GUID parts; only Array values are honored
  # @option options [String, Time, DateTime] :published_at publication timestamp
  # @option options [Array<Hash{Symbol => Object}>] :enclosures enclosure attribute hashes
  # @option options [Array<String>] :categories category labels
  # @option options [Class] :scraper scraper class that produced the article
  def initialize(**options)
    @to_h = {}
    options.each_pair { |key, value| @to_h[key] = value.freeze if value }
    @to_h.freeze

    return unless (unknown_keys = options.keys - PROVIDED_KEYS).any?

    Log.warn "Article: unknown keys found: #{unknown_keys.join(', ')}"
  end

  # Checks if the article is valid based on the presence of URL, ID, and either title or description.
  # @return [Boolean] True if the article is valid, otherwise false.
  def valid?
    !url.to_s.empty? && (!title.to_s.empty? || !description.to_s.empty?) && !id.to_s.empty?
  end

  # @yield [key, value] each provided key together with the value of its reader
  # @return [Enumerator] if no block is given
  def each
    return enum_for(:each) unless block_given?

    PROVIDED_KEYS.each { |key| yield(key, public_send(key)) }
  end

  # @return [String, nil] stable article identifier
  def id = blank_string_to_nil(@to_h[:id])

  # @return [String, nil] article title
  def title = blank_string_to_nil(@to_h[:title])

  # @return [String] rendered article description
  def description
    @description ||= Rendering::DescriptionBuilder.new(
      base: @to_h[:description],
      title:,
      url:,
      enclosures:,
      image:
    ).call
  end

  # @return [Url, nil]
  def url
    @url ||= Url.sanitize(@to_h[:url])
  end

  # @return [Url, nil]
  def image
    @image ||= Url.sanitize(@to_h[:image])
  end

  # @return [String, nil]
  def author = blank_string_to_nil(@to_h[:author])

  # Generates an identifier from the explicit guid parts, or from URL and ID, using CRC32.
  # @return [String] base-36 encoded checksum
  def guid
    @guid ||= Zlib.crc32(fetch_guid).to_s(36).encode('utf-8')
  end

  ##
  # Returns a deterministic fingerprint used to detect duplicate articles.
  #
  # @return [String, Integer]
  def deduplication_fingerprint
    dedup_from_url || dedup_from_id || dedup_from_guid || hash
  end

  # @return [Array<Html2rss::RssBuilder::Enclosure>] normalized enclosure objects
  def enclosures
    @enclosures ||= Array(@to_h[:enclosures])
                    .map { |enclosure| Html2rss::RssBuilder::Enclosure.new(**enclosure) }
  end

  # Primary enclosure: the first configured enclosure, or one derived from the image.
  #
  # The result (including nil) is memoized, so an unsupported enclosure type is
  # warned about once and reported as nil rather than leaking the logger's return value.
  #
  # @return [Html2rss::RssBuilder::Enclosure, nil]
  def enclosure
    return @enclosure if defined?(@enclosure)

    @enclosure = build_enclosure(@to_h[:enclosures]&.first)
  end

  # @return [Array<String>] normalized, unique category names
  def categories
    # Non-mutating pipeline: the stored collection is frozen by the constructor.
    @categories ||= Array(@to_h[:categories])
                    .map { |category| category.to_s.strip }
                    .reject(&:empty?)
                    .uniq
  end

  # Parses and returns the published_at time.
  # @return [DateTime, nil] nil when the value is blank or cannot be parsed
  def published_at
    return if (string = @to_h[:published_at].to_s.strip).empty?

    @published_at ||= DateTime.parse(string)
  rescue ArgumentError
    nil
  end

  # @return [Class, nil] scraper class that produced this article
  def scraper
    @to_h[:scraper]
  end

  # Articles only compare as equal (0) or incomparable (nil); there is no ordering.
  #
  # @param other [Object] value compared against this article
  # @return [Integer, nil] 0 when every provided key matches, otherwise nil
  def <=>(other)
    return nil unless other.is_a?(Article)

    0 if other.all? { |key, value| value == public_send(key) }
  end

  private

  # Builds the primary enclosure from the first raw enclosure entry.
  def build_enclosure(object)
    case object
    when Hash
      Html2rss::RssBuilder::Enclosure.new(**object)
    when nil
      Html2rss::RssBuilder::Enclosure.new(url: image) if image
    else
      Log.warn "Article: unknown enclosure type: #{object.class}"
      nil
    end
  end

  def dedup_from_url
    return unless (value = url)

    [value.to_s, id].compact.join(DEDUP_FINGERPRINT_SEPARATOR)
  end

  def dedup_from_id
    return if id.to_s.empty?

    id
  end

  def dedup_from_guid
    value = guid
    return if value.to_s.empty?

    [value, title, description].compact.join(DEDUP_FINGERPRINT_SEPARATOR)
  end

  # Source string for the GUID checksum.
  # Explicit guid parts win; when they are absent or all blank the URL/ID pair is used,
  # so articles never collapse onto the checksum of an empty string.
  def fetch_guid
    explicit = @to_h[:guid]

    if explicit.is_a?(Array)
      joined = explicit.map { |part| part.to_s.strip }.reject(&:empty?).join
      return joined unless joined.empty?
    end

    [url, id].join(DEDUP_FINGERPRINT_SEPARATOR)
  end

  # Turns blank strings into nil; every other value passes through untouched.
  def blank_string_to_nil(value)
    return if value.is_a?(String) && value.strip.empty?

    value
  end
end
186
+ end
187
+ end
@@ -1,20 +1,114 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Html2rss
4
- module RssBuilder
4
+ class RssBuilder
5
5
  ##
6
- # Builds the <channel> tag (with the provided maker).
6
+ # Extracts channel information from
7
+ # 1. the HTML document's <head>.
8
+ # 2. the HTTP response
7
9
  class Channel
10
+ # Fallback RSS ttl (in minutes) when no cache directives are present.
11
+ DEFAULT_TTL_IN_MINUTES = 360
12
+ # Description template used when no explicit or discovered description exists.
13
+ DEFAULT_DESCRIPTION_TEMPLATE = 'Latest items from %<url>s'
14
+
8
15
  ##
9
- # @param maker [RSS::Maker::RSS20::Channel]
10
- # @param config [Html2rss::Config]
11
- # @param tags [Set<Symbol>]
12
- # @return nil
13
- def self.add(maker, config, tags)
14
- tags.each { |tag| maker.public_send(:"#{tag}=", config.public_send(tag)) }
15
-
16
- maker.generator = "html2rss V. #{::Html2rss::VERSION}"
17
- maker.lastBuildDate = Time.now
16
# @param response [Html2rss::RequestService::Response] response the channel metadata is read from
# @param overrides [Hash{Symbol => String}] optional overrides for channel attributes
def initialize(response, overrides: {})
  @overrides = overrides
  @response = response
end
22
+
23
# Memoized channel title; the lookup itself is delegated to #fetch_title.
# @return [String] channel title
def title = @title ||= fetch_title
27
+
28
# Builds the channel URL from the response URL once and reuses it afterwards.
# @return [Html2rss::Url] canonical channel URL
def url
  @url ||= Html2rss::Url.from_absolute(@response.url)
end
30
+
31
# Resolution order: explicit override, the document's meta description (HTML
# responses only), then DEFAULT_DESCRIPTION_TEMPLATE filled with the channel URL.
# @return [String] channel description text
def description
  explicit = overrides[:description]
  return explicit unless explicit.to_s.empty?

  discovered = html_response? ? parsed_body.at_css('meta[name="description"]')&.[]('content') : nil

  discovered.to_s.empty? ? format(DEFAULT_DESCRIPTION_TEMPLATE, url: url) : discovered
end
41
+
42
# Resolution order: explicit override, the `max-age` value of the cache-control
# header (seconds, rounded up to whole minutes), then DEFAULT_TTL_IN_MINUTES.
# @return [Integer] cache time-to-live in minutes
def ttl
  explicit = overrides[:ttl]
  return explicit if explicit

  max_age = headers['cache-control']&.match(/max-age=(\d+)/)&.[](1)

  max_age ? max_age.to_i.fdiv(60).ceil : DEFAULT_TTL_IN_MINUTES
end
52
+
53
# Resolution order: explicit override, the leading two lowercase letters of the
# content-language header, then `lang` attributes found in an HTML body.
# @return [String, nil] ISO-like language code when available
def language
  explicit = overrides[:language]
  return explicit if explicit

  header_match = headers['content-language']&.match(/^([a-z]{2})/)
  return header_match[0] if header_match
  return unless html_response?

  parsed_body['lang'] || parsed_body.at_css('[lang]')&.[]('lang')
end
65
+
66
# Uses the explicit override when given; otherwise reads the `author` meta tag
# of an HTML response.
# @return [String, nil] channel author metadata
def author
  explicit = overrides[:author]
  return explicit if explicit

  parsed_body.at_css('meta[name="author"]')&.[]('content') if html_response?
end
74
+
75
# Uses the raw last-modified header value when present, otherwise the current time.
# @return [String, Time] source last-modified timestamp or current time fallback
def last_build_date
  headers['last-modified'] || Time.now
end
77
+
78
# Uses the explicit override when given; otherwise sanitizes the `og:image`
# meta content of an HTML response.
# @return [Html2rss::Url, nil] channel image URL
def image
  explicit = overrides[:image]
  return explicit if explicit
  return unless html_response?

  image_url = parsed_body.at_css('meta[property="og:image"]')&.[]('content')
  Url.sanitize(image_url) if image_url
end
88
+
89
+ private
90
+
91
+ attr_reader :overrides
92
+
93
# Memoized parsed body of the response.
def parsed_body
  @parsed_body ||= @response.parsed_body
end
94
# Memoized headers of the response.
def headers
  @headers ||= @response.headers
end
95
# Whether the response reports itself as HTML.
# Memoized with `defined?` because `||=` never caches a false answer and would
# ask the response again on every call.
# @return [Boolean]
def html_response?
  return @html_response if defined?(@html_response)

  @html_response = @response.html_response?
end
96
+
97
# Picks the first available title: explicit override, the parsed document title,
# then a title derived from the channel URL.
def fetch_title
  overrides[:title] || parsed_title || url.channel_titleized
end
104
+
105
# Reads the document's `<title>` with whitespace runs collapsed to single spaces.
# Emptiness is checked after normalization, so a whitespace-only title yields nil
# instead of an empty string (which callers would treat as a usable title).
# @return [String, nil] normalized title, or nil for non-HTML responses and blank titles
def parsed_title
  return unless html_response?

  title = parsed_body.at_css('head > title')&.text.to_s.gsub(/\s+/, ' ').strip
  title unless title.empty?
end
19
113
  end
20
114
  end
@@ -0,0 +1,62 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'mime/types'
4
+
5
+ module Html2rss
6
+ class RssBuilder
7
##
# Represents an enclosure for an RSS item.
class Enclosure
  # Content type used when nothing more specific can be inferred.
  DEFAULT_CONTENT_TYPE = 'application/octet-stream'

  ##
  # Guesses the content type based on the file extension of the URL.
  # Falls back to `default` when the URL is missing, has no usable path,
  # or its extension is unknown to MIME::Types.
  #
  # @param url [Html2rss::Url]
  # @param default [String] default content type
  # @return [String] guessed content type, or default
  def self.guess_content_type_from_url(url, default: DEFAULT_CONTENT_TYPE)
    return default unless url

    # `to_s` guards against a nil path and against `split` returning an empty array.
    path = url.path.to_s.split('?').first.to_s
    extension = File.extname(path).delete('.')

    MIME::Types.type_for(extension).first&.to_s || default
  end

  # Copies URL, type and length of the enclosure onto the maker's enclosure element.
  #
  # @param enclosure [Html2rss::RssBuilder::Enclosure, nil] built enclosure object for the current RSS item
  # @param maker [RSS::Maker::RSS20::ItemsBase::ItemBase] RSS item builder
  # @return [void]
  def self.add(enclosure, maker)
    return unless enclosure

    maker.enclosure.tap do |enclosure_maker|
      enclosure_maker.url = enclosure.url.to_s
      enclosure_maker.type = enclosure.type
      enclosure_maker.length = enclosure.bits_length
    end
  end

  # @param url [Html2rss::Url] absolute enclosure URL
  # @param type [String, nil] optional enclosure MIME type
  # @param bits_length [Integer] enclosure byte length (historical name)
  # @raise [ArgumentError] when the URL is missing or not absolute
  def initialize(url:, type: nil, bits_length: 0)
    raise ArgumentError, 'An Enclosure requires an absolute URL' unless url&.absolute?

    @url = url
    @type = type
    @bits_length = bits_length
  end

  # @return [String] explicit MIME type or one inferred from URL extension
  def type = @type || self.class.guess_content_type_from_url(url)

  # @return [Integer] enclosure length in bytes
  def bytes_length = @bits_length

  # @return [Integer] enclosure length in bytes (legacy reader name)
  def bits_length = bytes_length

  # @return [Html2rss::Url] absolute enclosure URL
  attr_reader :url
end
61
+ end
62
+ end
@@ -1,7 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Html2rss
4
- module RssBuilder
4
+ class RssBuilder
5
5
  ##
6
6
  # Represents a stylesheet.
7
7
  class Stylesheet
@@ -10,7 +10,7 @@ module Html2rss
10
10
  # Adds the stylesheet XML tags to the RSS.
11
11
  #
12
12
  # @param maker [RSS::Maker::RSS20] RSS maker object.
13
- # @param stylesheets [Array<Html2rss::Config::Stylesheet>] Array of stylesheet configurations.
13
+ # @param stylesheets [Array<Html2rss::RssBuilder::Stylesheet>] Array of stylesheet configurations.
14
14
  # @return [nil]
15
15
  def add(maker, stylesheets)
16
16
  stylesheets.each do |stylesheet|
@@ -24,7 +24,7 @@ module Html2rss
24
24
  # Adds a single Stylesheet to the RSS.
25
25
  #
26
26
  # @param maker [RSS::Maker::RSS20] RSS maker object.
27
- # @param stylesheet [Html2rss::Config::Stylesheet] Stylesheet configuration.
27
+ # @param stylesheet [Html2rss::RssBuilder::Stylesheet] Stylesheet configuration.
28
28
  # @return [nil]
29
29
  def add_stylesheet(maker, stylesheet)
30
30
  maker.xml_stylesheets.new_xml_stylesheet do |xss|
@@ -35,8 +35,12 @@ module Html2rss
35
35
  end
36
36
  end
37
37
 
38
- TYPES = ['text/css', 'text/xsl'].freeze
38
+ # Allowed stylesheet MIME types for RSS processing instructions.
39
+ TYPES = ['text/css', 'text/xsl'].to_set.freeze
39
40
 
41
+ # @param href [String] stylesheet URL
42
+ # @param type [String] MIME type (`text/css` or `text/xsl`)
43
+ # @param media [String] media query hint for the stylesheet
40
44
  def initialize(href:, type:, media: 'all')
41
45
  raise ArgumentError, 'stylesheet.href must be a String' unless href.is_a?(String)
42
46
  raise ArgumentError, 'stylesheet.type invalid' unless TYPES.include?(type)
@@ -4,93 +4,98 @@ require 'rss'
4
4
 
5
5
  module Html2rss
6
6
  ##
7
- # Builds the RSS 2.0 feed, which consists of the '<channel>' and the '<item>'s
8
- # tags in the RSS.
9
- module RssBuilder
10
- # Possible tags inside a RSS 2.0 <channel> tag.
11
- CHANNEL_TAGS = %i[language author title description link ttl].freeze
12
- # Possible tags inside a RSS 2.0 <item> tag.
13
- ITEM_TAGS = %i[title link description author comments updated].freeze
7
+ # Builds an RSS Feed by providing channel, articles and stylesheets.
8
+ class RssBuilder
9
+ class << self
10
# Fills one RSS item from an article: string values first, then categories,
# the primary enclosure, and finally the guid.
#
# @param article [Html2rss::RssBuilder::Article] source article
# @param item_maker [RSS::Maker::RSS20::ItemsBase::ItemBase] RSS item builder
# @return [void]
def add_item(article, item_maker)
  add_item_string_values(article, item_maker)
  add_item_categories(article, item_maker)

  primary_enclosure = article.enclosure
  Enclosure.add(primary_enclosure, item_maker)

  add_item_guid(article, item_maker)
end
14
19
 
15
- ##
16
- # Builds an RSS 2.0 feed based on the provided configuration.
17
- #
18
- # @param config [Html2rss::Config] Configuration object containing feed details.
19
- # @return [RSS::Rss] RSS feed object.
20
- def self.build(config)
21
- RSS::Maker.make('2.0') do |maker|
22
- add_stylesheets(maker, config.stylesheets)
23
- add_channel(maker, config)
24
- add_items(maker, config)
20
+ private
21
+
22
# Copies title, description and author onto the item (skipping nil and empty
# values), then sets the link when a URL exists and the publication date.
# Uses `public_send` so only the public reader/writer API is touched.
def add_item_string_values(article, item_maker)
  %i[title description author].each do |attr|
    value = article.public_send(attr)
    next if value.nil? || value == false || value.empty?

    item_maker.public_send(:"#{attr}=", value)
  end

  item_maker.link = article.url.to_s if article.url
  # pubDate is assigned even when absent (nil), matching the maker's unset state.
  item_maker.pubDate = article.published_at&.rfc2822
end
26
- end
27
33
 
28
- ##
29
- # Adds stylesheets to the RSS maker.
30
- #
31
- # @param maker [RSS::Maker] RSS maker instance.
32
- # @param stylesheets [Array<String>] Array of stylesheets to add.
33
- def self.add_stylesheets(maker, stylesheets)
34
- Stylesheet.add(maker, stylesheets)
35
- end
34
# Appends one category element per article category to the item.
def add_item_categories(article, item_maker)
  article.categories.each do |label|
    item_maker.categories.new_category.content = label
  end
end
36
37
 
37
- ##
38
- # Adds channel information to the RSS maker.
39
- #
40
- # @param maker [RSS::Maker] RSS maker instance.
41
- # @param config [Html2rss::Config] Configuration object containing feed details.
42
- def self.add_channel(maker, config)
43
- channel = maker.channel
44
- CHANNEL_TAGS.each do |tag|
45
- Channel.add(channel, config, [tag])
38
# Writes the article guid onto the item and marks it as not being a permalink.
# Returns the maker's guid element.
def add_item_guid(article, item_maker)
  guid_element = item_maker.guid
  guid_element.content = article.guid
  guid_element.isPermaLink = false
  guid_element
end
47
44
  end
48
45
 
49
46
  ##
50
- # Adds items to the RSS maker based on configuration.
51
- #
52
- # @param maker [RSS::Maker] RSS maker instance.
53
- # @param config [Html2rss::Config] Configuration object containing feed details.
54
- def self.add_items(maker, config)
55
- item_attributes = extract_item_attributes(config)
56
- items = fetch_items(config)
57
- items.reverse! if config.items_order == :reverse
58
-
59
- items.each do |item|
60
- add_item(maker, item, item_attributes)
47
# @param channel [Html2rss::RssBuilder::Channel] channel information for the RSS feed
# @param articles [Array<Html2rss::RssBuilder::Article>] articles to include in the RSS feed
# @param stylesheets [Array<Hash>] optional stylesheet configurations (keyword hashes)
def initialize(channel:, articles:, stylesheets: [])
  @stylesheets = stylesheets
  @articles = articles
  @channel = channel
end
55
+
56
# Assembles the feed: stylesheets first, then the channel, then the items.
# @return [RSS::Rss] RSS 2.0 document instance
def call
  RSS::Maker.make('2.0') do |feed|
    Stylesheet.add(feed, stylesheets)

    channel_maker = feed.channel
    make_channel(channel_maker)
    make_items(feed)
  end
end
63
65
 
64
- ##
65
- # Adds a single item to the RSS maker.
66
- #
67
- # @param maker [RSS::Maker] RSS maker instance.
68
- # @param item [Html2rss::Item] Item to add.
69
- # @param item_attributes [Array<Symbol>] Array of item attributes.
70
- # @return [nil]
71
- def self.add_item(maker, item, item_attributes)
72
- new_item = maker.items.new_item
73
- Item.add(new_item, item, item_attributes)
66
+ private
67
+
68
+ attr_reader :channel, :articles
69
+
70
# Builds a Stylesheet object from each configured attribute hash.
def stylesheets
  @stylesheets.map do |attributes|
    Stylesheet.new(**attributes)
  end
end
75
73
 
76
- ##
77
- # Extracts item attributes from configuration.
78
- #
79
- # @param config [Html2rss::Config] Configuration object containing feed details.
80
- # @return [Array<Symbol>] Array of item attributes.
81
- def self.extract_item_attributes(config)
82
- config.item_selector_names & ITEM_TAGS
74
# Copies the channel values onto the maker's channel element, then sets the
# link, the generator string and the last build date.
def make_channel(maker)
  maker.language = channel.language
  maker.title = channel.title
  maker.description = channel.description
  maker.ttl = channel.ttl

  maker.link = channel.url.to_s
  maker.generator = generator
  maker.updated = channel.last_build_date
end
84
83
 
85
- ##
86
- # Fetches items from the URL specified in configuration.
87
- #
88
- # @param config [Html2rss::Config] Configuration object containing feed details.
89
- # @return [Array<Html2rss::Item>] Array of items.
90
- def self.fetch_items(config)
91
- Html2rss::Item.from_url(config.url, config)
84
# Creates one RSS item per article and lets the class-level add_item fill it.
def make_items(maker)
  builder = self.class

  articles.each do |entry|
    maker.items.new_item { |item| builder.add_item(entry, item) }
  end
end
93
89
 
94
- private_class_method :extract_item_attributes, :fetch_items, :add_item
90
# Generator string: the html2rss version plus a per-scraper article count.
# The `Html2rss::` and `Scraper::` namespace segments are stripped from the names.
def generator
  tallied = articles.flat_map(&:scraper).tally

  labels = tallied.map do |klass, count|
    "#{klass.to_s.gsub(/(?<namespace>Html2rss|Scraper)::/, '')} (#{count})"
  end

  "html2rss V. #{Html2rss::VERSION} (scrapers: #{labels.join(', ')})"
end
95
100
  end
96
101
  end