html2rss 0.17.0 → 0.19.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +90 -639
- data/exe/html2rss +1 -1
- data/html2rss.gemspec +5 -2
- data/lib/html2rss/articles/deduplicator.rb +50 -0
- data/lib/html2rss/auto_source/cleanup.rb +44 -5
- data/lib/html2rss/auto_source/scraper/html.rb +123 -43
- data/lib/html2rss/auto_source/scraper/json_state.rb +457 -0
- data/lib/html2rss/auto_source/scraper/microdata.rb +505 -0
- data/lib/html2rss/auto_source/scraper/schema/category_extractor.rb +102 -0
- data/lib/html2rss/auto_source/scraper/schema/item_list.rb +3 -2
- data/lib/html2rss/auto_source/scraper/schema/list_item.rb +6 -4
- data/lib/html2rss/auto_source/scraper/schema/thing.rb +69 -8
- data/lib/html2rss/auto_source/scraper/schema.rb +27 -12
- data/lib/html2rss/auto_source/scraper/semantic_html/anchor_selector.rb +204 -0
- data/lib/html2rss/auto_source/scraper/semantic_html.rb +88 -79
- data/lib/html2rss/auto_source/scraper/wordpress_api/page_scope.rb +311 -0
- data/lib/html2rss/auto_source/scraper/wordpress_api/posts_endpoint.rb +135 -0
- data/lib/html2rss/auto_source/scraper/wordpress_api.rb +186 -0
- data/lib/html2rss/auto_source/scraper.rb +160 -8
- data/lib/html2rss/auto_source.rb +123 -47
- data/lib/html2rss/blocked_surface.rb +65 -0
- data/lib/html2rss/category_extractor.rb +82 -0
- data/lib/html2rss/cli.rb +194 -23
- data/lib/html2rss/config/class_methods.rb +178 -0
- data/lib/html2rss/config/dynamic_params.rb +70 -0
- data/lib/html2rss/config/multiple_feeds_config.rb +51 -0
- data/lib/html2rss/config/request_headers.rb +136 -0
- data/lib/html2rss/config/schema.rb +240 -0
- data/lib/html2rss/config/validator.rb +146 -0
- data/lib/html2rss/config.rb +118 -61
- data/lib/html2rss/error.rb +31 -0
- data/lib/html2rss/feed_pipeline/auto_fallback.rb +127 -0
- data/lib/html2rss/feed_pipeline.rb +127 -0
- data/lib/html2rss/hash_util.rb +101 -0
- data/lib/html2rss/html_extractor/date_extractor.rb +20 -0
- data/lib/html2rss/html_extractor/enclosure_extractor.rb +120 -0
- data/lib/html2rss/html_extractor/image_extractor.rb +58 -0
- data/lib/html2rss/html_extractor.rb +141 -0
- data/lib/html2rss/html_navigator.rb +54 -0
- data/lib/html2rss/json_feed_builder/item.rb +94 -0
- data/lib/html2rss/json_feed_builder.rb +59 -0
- data/lib/html2rss/rendering/audio_renderer.rb +36 -0
- data/lib/html2rss/rendering/description_builder.rb +87 -0
- data/lib/html2rss/rendering/image_renderer.rb +41 -0
- data/lib/html2rss/rendering/media_renderer.rb +37 -0
- data/lib/html2rss/rendering/pdf_renderer.rb +34 -0
- data/lib/html2rss/rendering/video_renderer.rb +36 -0
- data/lib/html2rss/rendering.rb +23 -0
- data/lib/html2rss/request_controls.rb +123 -0
- data/lib/html2rss/request_service/botasaurus_contract.rb +161 -0
- data/lib/html2rss/request_service/botasaurus_strategy.rb +98 -0
- data/lib/html2rss/request_service/browserless_strategy.rb +103 -7
- data/lib/html2rss/request_service/budget.rb +39 -0
- data/lib/html2rss/request_service/context.rb +77 -21
- data/lib/html2rss/request_service/faraday_strategy.rb +137 -5
- data/lib/html2rss/request_service/policy.rb +252 -0
- data/lib/html2rss/request_service/puppet_commander.rb +212 -13
- data/lib/html2rss/request_service/response.rb +51 -3
- data/lib/html2rss/request_service/response_guard.rb +62 -0
- data/lib/html2rss/request_service.rb +50 -15
- data/lib/html2rss/request_session/rel_next_pager.rb +70 -0
- data/lib/html2rss/request_session/runtime_input.rb +71 -0
- data/lib/html2rss/request_session/runtime_policy.rb +83 -0
- data/lib/html2rss/request_session.rb +122 -0
- data/lib/html2rss/rss_builder/article.rb +187 -0
- data/lib/html2rss/rss_builder/channel.rb +105 -11
- data/lib/html2rss/rss_builder/enclosure.rb +62 -0
- data/lib/html2rss/rss_builder/stylesheet.rb +8 -4
- data/lib/html2rss/rss_builder.rb +76 -71
- data/lib/html2rss/selectors/config.rb +123 -0
- data/lib/html2rss/selectors/extractors/attribute.rb +52 -0
- data/lib/html2rss/selectors/extractors/href.rb +55 -0
- data/lib/html2rss/selectors/extractors/html.rb +49 -0
- data/lib/html2rss/selectors/extractors/static.rb +42 -0
- data/lib/html2rss/selectors/extractors/text.rb +47 -0
- data/lib/html2rss/selectors/extractors.rb +53 -0
- data/lib/html2rss/selectors/object_to_xml_converter.rb +62 -0
- data/lib/html2rss/selectors/post_processors/base.rb +80 -0
- data/lib/html2rss/selectors/post_processors/gsub.rb +88 -0
- data/lib/html2rss/selectors/post_processors/html_to_markdown.rb +48 -0
- data/lib/html2rss/selectors/post_processors/html_transformers/transform_urls_to_absolute_ones.rb +44 -0
- data/lib/html2rss/selectors/post_processors/html_transformers/wrap_img_in_a.rb +53 -0
- data/lib/html2rss/selectors/post_processors/markdown_to_html.rb +55 -0
- data/lib/html2rss/selectors/post_processors/parse_time.rb +78 -0
- data/lib/html2rss/selectors/post_processors/parse_uri.rb +43 -0
- data/lib/html2rss/selectors/post_processors/sanitize_html.rb +154 -0
- data/lib/html2rss/selectors/post_processors/substring.rb +77 -0
- data/lib/html2rss/selectors/post_processors/template.rb +76 -0
- data/lib/html2rss/selectors/post_processors.rb +48 -0
- data/lib/html2rss/selectors.rb +301 -0
- data/lib/html2rss/url.rb +266 -0
- data/lib/html2rss/version.rb +2 -1
- data/lib/html2rss.rb +67 -71
- data/lib/tasks/config_schema.rake +17 -0
- data/schema/html2rss-config.schema.json +551 -0
- metadata +120 -38
- data/lib/html2rss/attribute_post_processors/base.rb +0 -74
- data/lib/html2rss/attribute_post_processors/gsub.rb +0 -64
- data/lib/html2rss/attribute_post_processors/html_to_markdown.rb +0 -43
- data/lib/html2rss/attribute_post_processors/html_transformers/transform_urls_to_absolute_ones.rb +0 -27
- data/lib/html2rss/attribute_post_processors/html_transformers/wrap_img_in_a.rb +0 -41
- data/lib/html2rss/attribute_post_processors/markdown_to_html.rb +0 -50
- data/lib/html2rss/attribute_post_processors/parse_time.rb +0 -46
- data/lib/html2rss/attribute_post_processors/parse_uri.rb +0 -46
- data/lib/html2rss/attribute_post_processors/sanitize_html.rb +0 -115
- data/lib/html2rss/attribute_post_processors/substring.rb +0 -72
- data/lib/html2rss/attribute_post_processors/template.rb +0 -101
- data/lib/html2rss/attribute_post_processors.rb +0 -44
- data/lib/html2rss/auto_source/article.rb +0 -127
- data/lib/html2rss/auto_source/channel.rb +0 -78
- data/lib/html2rss/auto_source/reducer.rb +0 -48
- data/lib/html2rss/auto_source/rss_builder.rb +0 -70
- data/lib/html2rss/auto_source/scraper/semantic_html/extractor.rb +0 -136
- data/lib/html2rss/auto_source/scraper/semantic_html/image.rb +0 -54
- data/lib/html2rss/config/channel.rb +0 -125
- data/lib/html2rss/config/selectors.rb +0 -103
- data/lib/html2rss/item.rb +0 -186
- data/lib/html2rss/item_extractors/attribute.rb +0 -50
- data/lib/html2rss/item_extractors/href.rb +0 -52
- data/lib/html2rss/item_extractors/html.rb +0 -46
- data/lib/html2rss/item_extractors/static.rb +0 -39
- data/lib/html2rss/item_extractors/text.rb +0 -44
- data/lib/html2rss/item_extractors.rb +0 -88
- data/lib/html2rss/object_to_xml_converter.rb +0 -56
- data/lib/html2rss/rss_builder/item.rb +0 -83
- data/lib/html2rss/utils.rb +0 -113
|
@@ -0,0 +1,187 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'zlib'
|
|
4
|
+
require 'sanitize'
|
|
5
|
+
require 'nokogiri'
|
|
6
|
+
|
|
7
|
+
module Html2rss
|
|
8
|
+
class RssBuilder
|
|
9
|
+
##
|
|
10
|
+
# Article is a simple data object representing an article extracted from a page.
|
|
11
|
+
# It is enumerable and responds to all keys specified in PROVIDED_KEYS.
|
|
12
|
+
class Article
  include Enumerable
  include Comparable

  # Allowed article attributes accepted by the value object constructor.
  PROVIDED_KEYS = %i[id title description url image author guid published_at enclosures categories scraper].freeze
  # Separator used to build deterministic deduplication fingerprints and fallback GUID sources.
  DEDUP_FINGERPRINT_SEPARATOR = '#!/'

  # Stores every truthy option (frozen) in an immutable hash and warns about
  # keys that are not listed in PROVIDED_KEYS.
  #
  # @param options [Hash{Symbol => String}]
  # @option options [String] :id stable article identifier
  # @option options [String] :title article title
  # @option options [String] :description article description/content
  # @option options [String, Html2rss::Url] :url canonical article URL
  # @option options [String, Html2rss::Url] :image image URL for fallback enclosure rendering
  # @option options [String] :author author name
  # @option options [Array<String>] :guid explicit GUID parts (only Array values are honored, see #fetch_guid)
  # @option options [String, Time, DateTime] :published_at publication timestamp
  # @option options [Array<Hash{Symbol => Object}>] :enclosures enclosure attribute hashes
  # @option options [Array<String>] :categories category labels
  # @option options [Class] :scraper scraper class that produced the article
  def initialize(**options)
    @to_h = {}
    # NOTE: values are frozen in place, i.e. the caller's objects become frozen too.
    options.each_pair { |key, value| @to_h[key] = value.freeze if value }
    @to_h.freeze

    return unless (unknown_keys = options.keys - PROVIDED_KEYS).any?

    Log.warn "Article: unknown keys found: #{unknown_keys.join(', ')}"
  end

  # Checks if the article is valid based on the presence of URL, ID, and either title or description.
  # @return [Boolean] True if the article is valid, otherwise false.
  def valid?
    !url.to_s.empty? && (!title.to_s.empty? || !description.to_s.empty?) && !id.to_s.empty?
  end

  # Yields each key of PROVIDED_KEYS together with the value of the reader of the same name.
  #
  # @yield [key, value]
  # @return [Enumerator] if no block is given
  def each
    return enum_for(:each) unless block_given?

    PROVIDED_KEYS.each { |key| yield(key, public_send(key)) }
  end

  # @return [String, nil] stable article identifier
  def id = blank_string_to_nil(@to_h[:id])

  # @return [String, nil] article title
  def title = blank_string_to_nil(@to_h[:title])

  # @return [String] rendered article description
  def description
    @description ||= Rendering::DescriptionBuilder.new(
      base: @to_h[:description],
      title:,
      url:,
      enclosures:,
      image:
    ).call
  end

  # @return [Url, nil]
  def url
    @url ||= Url.sanitize(@to_h[:url])
  end

  # @return [Url, nil]
  def image
    @image ||= Url.sanitize(@to_h[:image])
  end

  # @return [String, nil]
  def author = blank_string_to_nil(@to_h[:author])

  # Generates an identifier by hashing the GUID source (explicit guid parts,
  # otherwise URL and ID) with CRC32 and rendering it in base 36.
  # @return [String]
  def guid
    @guid ||= Zlib.crc32(fetch_guid).to_s(36).encode('utf-8')
  end

  ##
  # Returns a deterministic fingerprint used to detect duplicate articles.
  # Falls back to the object hash when neither URL, ID nor GUID yield one.
  #
  # @return [String, Integer]
  def deduplication_fingerprint
    dedup_from_url || dedup_from_id || dedup_from_guid || hash
  end

  # @return [Array<Html2rss::RssBuilder::Enclosure>] normalized enclosure objects
  def enclosures
    @enclosures ||= Array(@to_h[:enclosures])
                    .map { |enclosure| Html2rss::RssBuilder::Enclosure.new(**enclosure) }
  end

  # Builds the primary enclosure from the first enclosure hash, falling back
  # to the article image. An unsupported enclosure value is logged and
  # results in nil (never in the logger's return value).
  #
  # @return [Html2rss::RssBuilder::Enclosure, nil]
  def enclosure
    return @enclosure if defined?(@enclosure)

    @enclosure =
      case (object = @to_h[:enclosures]&.first)
      when Hash
        Html2rss::RssBuilder::Enclosure.new(**object)
      when nil
        Html2rss::RssBuilder::Enclosure.new(url: image) if image
      else
        Log.warn "Article: unknown enclosure type: #{object.class}"
        nil
      end
  end

  # @return [Array<String>] normalized, unique category names
  def categories
    # dup first: the stored value was frozen by the constructor.
    @categories ||= @to_h[:categories].dup.to_a.tap do |categories|
      categories.map! { |category| category.to_s.strip }
      categories.reject!(&:empty?)
      categories.uniq!
    end
  end

  # Parses and returns the published_at time.
  # @return [DateTime, nil] nil when the value is blank or cannot be parsed
  def published_at
    return if (string = @to_h[:published_at].to_s.strip).empty?

    @published_at ||= DateTime.parse(string)
  rescue ArgumentError
    nil
  end

  # @return [Class, nil] scraper class that produced this article
  def scraper
    @to_h[:scraper]
  end

  # Articles are only comparable for equality: the result is 0 when every
  # provided attribute matches and nil otherwise.
  #
  # @param other [Object] value compared against this article
  # @return [Integer, nil] comparison result for compatible Article values
  def <=>(other)
    return nil unless other.is_a?(Article)

    0 if other.all? { |key, value| value == public_send(key) ? public_send(key) <=> value : false }
  end

  private

  # @return [String, nil] URL (plus ID when present) joined by the separator
  def dedup_from_url
    return unless (value = url)

    [value.to_s, id].compact.join(DEDUP_FINGERPRINT_SEPARATOR)
  end

  # @return [String, nil] the ID, unless it is blank
  def dedup_from_id
    return if id.to_s.empty?

    id
  end

  # @return [String, nil] GUID joined with title and description
  def dedup_from_guid
    value = guid
    return if value.to_s.empty?

    [value, title, description].compact.join(DEDUP_FINGERPRINT_SEPARATOR)
  end

  # Determines the string that is hashed into the GUID.
  #
  # An Array given as :guid is stripped, cleared of blank parts and joined.
  # When that leaves nothing (or no Array was given), URL and ID are used, so
  # that articles with blank guid parts do not all collapse to the same GUID.
  #
  # @return [String]
  def fetch_guid
    explicit = @to_h[:guid]

    if explicit.is_a?(Array)
      joined = explicit.map { |part| part.to_s.strip }.reject(&:empty?).join
      return joined unless joined.empty?
    end

    [url, id].join(DEDUP_FINGERPRINT_SEPARATOR)
  end

  # @param value [Object]
  # @return [Object, nil] nil for whitespace-only Strings, otherwise the value itself
  def blank_string_to_nil(value)
    return if value.is_a?(String) && value.strip.empty?

    value
  end
end
|
|
186
|
+
end
|
|
187
|
+
end
|
|
@@ -1,20 +1,114 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
3
|
module Html2rss
|
|
4
|
-
|
|
4
|
+
class RssBuilder
|
|
5
5
|
##
|
|
6
|
-
#
|
|
6
|
+
# Extracts channel information from
|
|
7
|
+
# 1. the HTML document's <head>.
|
|
8
|
+
# 2. the HTTP response
|
|
7
9
|
class Channel
  # Fallback RSS ttl (in minutes) when no cache directives are present.
  DEFAULT_TTL_IN_MINUTES = 360
  # Description template used when no explicit or discovered description exists.
  DEFAULT_DESCRIPTION_TEMPLATE = 'Latest items from %<url>s'

  ##
  # @param response [Html2rss::RequestService::Response]
  # @param overrides [Hash{Symbol => String}] optional overrides for channel attributes
  def initialize(response, overrides: {})
    @response = response
    @overrides = overrides
  end

  # @return [String] channel title derived from overrides, document title, or URL
  def title
    @title ||= fetch_title
  end

  # @return [Html2rss::Url] canonical channel URL
  def url = @url ||= Html2rss::Url.from_absolute(@response.url)

  # Prefers a non-empty override, then the document's meta description,
  # and finally a generic text built from the channel URL.
  #
  # @return [String] channel description text
  def description
    return overrides[:description] unless overrides[:description].to_s.empty?

    description = parsed_body.at_css('meta[name="description"]')&.[]('content') if html_response?

    return format(DEFAULT_DESCRIPTION_TEMPLATE, url:) if description.to_s.empty?

    description
  end

  # Uses the override, else the max-age of the cache-control header
  # (seconds, rounded up to whole minutes), else DEFAULT_TTL_IN_MINUTES.
  #
  # @return [Integer] cache time-to-live in minutes
  def ttl
    return overrides[:ttl] if overrides[:ttl]

    if (ttl = headers['cache-control']&.match(/max-age=(\d+)/)&.[](1))
      return ttl.to_i.fdiv(60).ceil
    end

    DEFAULT_TTL_IN_MINUTES
  end

  # Uses the override, else the first two lowercase letters of the
  # content-language header, else a lang attribute found in the document.
  #
  # @return [String, nil] ISO-like language code when available
  def language
    return overrides[:language] if overrides[:language]

    if (language_code = headers['content-language']&.match(/^([a-z]{2})/))
      return language_code[0]
    end

    return unless html_response?

    parsed_body['lang'] || parsed_body.at_css('[lang]')&.[]('lang')
  end

  # @return [String, nil] channel author metadata
  def author
    return overrides[:author] if overrides[:author]

    return unless html_response?

    parsed_body.at_css('meta[name="author"]')&.[]('content')
  end

  # @return [String, Time] source last-modified timestamp or current time fallback
  def last_build_date = headers['last-modified'] || Time.now

  # Uses the override as given, else the sanitized og:image meta content.
  #
  # @return [Html2rss::Url, nil] channel image URL
  def image
    return overrides[:image] if overrides[:image]

    return unless html_response?

    if (image_url = parsed_body.at_css('meta[property="og:image"]')&.[]('content'))
      Url.sanitize(image_url)
    end
  end

  private

  attr_reader :overrides

  def parsed_body = @parsed_body ||= @response.parsed_body
  def headers = @headers ||= @response.headers

  # Memoized with defined? so that a false result is cached as well.
  #
  # @return [Boolean]
  def html_response?
    return @html_response if defined?(@html_response)

    @html_response = @response.html_response?
  end

  # @return [String] override title, document title, or a title derived from the URL
  def fetch_title
    overrides[:title] || parsed_title || url.channel_titleized
  end

  # Reads the document's <title> and collapses all whitespace runs.
  # A missing, empty or whitespace-only title yields nil so that the
  # caller can fall back to the URL-derived title.
  #
  # @return [String, nil]
  def parsed_title
    return unless html_response?

    title = parsed_body.at_css('head > title')&.text.to_s.gsub(/\s+/, ' ').strip
    title unless title.empty?
  end
end
|
|
20
114
|
end
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'mime/types'
|
|
4
|
+
|
|
5
|
+
module Html2rss
|
|
6
|
+
class RssBuilder
|
|
7
|
+
##
|
|
8
|
+
# Represents an enclosure for an RSS item.
|
|
9
|
+
class Enclosure
  ##
  # Guesses the content type based on the file extension of the URL.
  #
  # The given default is returned when no URL is given, when the URL has
  # no (or an empty) path, or when the extension maps to no known MIME type.
  #
  # @param url [Html2rss::Url]
  # @param default [String] default content type
  # @return [String] guessed content type, or default
  def self.guess_content_type_from_url(url, default: 'application/octet-stream')
    return default unless url

    # An empty path splits into an empty array; coerce to String so File.extname never receives nil.
    path = url.path.to_s.split('?').first.to_s

    content_type = MIME::Types.type_for(File.extname(path).delete('.'))
    content_type.first&.to_s || default
  end

  # Copies URL, type and length of the enclosure onto the item's enclosure maker.
  #
  # @param enclosure [Html2rss::RssBuilder::Enclosure, nil] built enclosure object for the current RSS item
  # @param maker [RSS::Maker::RSS20::ItemsBase::ItemBase] RSS item builder
  # @return [void]
  def self.add(enclosure, maker)
    return unless enclosure

    maker.enclosure.tap do |enclosure_maker|
      enclosure_maker.url = enclosure.url.to_s
      enclosure_maker.type = enclosure.type
      enclosure_maker.length = enclosure.bits_length
    end
  end

  # @param url [Html2rss::Url] absolute enclosure URL
  # @param type [String, nil] optional enclosure MIME type
  # @param bits_length [Integer] enclosure byte length (historical name)
  # @raise [ArgumentError] when the URL is missing or not absolute
  def initialize(url:, type: nil, bits_length: 0)
    raise ArgumentError, 'An Enclosure requires an absolute URL' if !url || !url.absolute?

    @url = url
    @type = type
    @bits_length = bits_length
  end

  # @return [String] explicit MIME type or one inferred from URL extension
  def type = @type || self.class.guess_content_type_from_url(url)

  # @return [Integer] enclosure length in bytes
  def bytes_length = @bits_length

  # @return [Integer] enclosure length in bytes (legacy reader name)
  def bits_length = bytes_length

  # @return [Html2rss::Url] absolute enclosure URL
  attr_reader :url
end
|
|
61
|
+
end
|
|
62
|
+
end
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
3
|
module Html2rss
|
|
4
|
-
|
|
4
|
+
class RssBuilder
|
|
5
5
|
##
|
|
6
6
|
# Represents a stylesheet.
|
|
7
7
|
class Stylesheet
|
|
@@ -10,7 +10,7 @@ module Html2rss
|
|
|
10
10
|
# Adds the stylesheet XML tags to the RSS.
|
|
11
11
|
#
|
|
12
12
|
# @param maker [RSS::Maker::RSS20] RSS maker object.
|
|
13
|
-
# @param stylesheets [Array<Html2rss::
|
|
13
|
+
# @param stylesheets [Array<Html2rss::RssBuilder::Stylesheet>] Array of stylesheet configurations.
|
|
14
14
|
# @return [nil]
|
|
15
15
|
def add(maker, stylesheets)
|
|
16
16
|
stylesheets.each do |stylesheet|
|
|
@@ -24,7 +24,7 @@ module Html2rss
|
|
|
24
24
|
# Adds a single Stylesheet to the RSS.
|
|
25
25
|
#
|
|
26
26
|
# @param maker [RSS::Maker::RSS20] RSS maker object.
|
|
27
|
-
# @param stylesheet [Html2rss::
|
|
27
|
+
# @param stylesheet [Html2rss::RssBuilder::Stylesheet] Stylesheet configuration.
|
|
28
28
|
# @return [nil]
|
|
29
29
|
def add_stylesheet(maker, stylesheet)
|
|
30
30
|
maker.xml_stylesheets.new_xml_stylesheet do |xss|
|
|
@@ -35,8 +35,12 @@ module Html2rss
|
|
|
35
35
|
end
|
|
36
36
|
end
|
|
37
37
|
|
|
38
|
-
|
|
38
|
+
# Allowed stylesheet MIME types for RSS processing instructions.
|
|
39
|
+
TYPES = ['text/css', 'text/xsl'].to_set.freeze
|
|
39
40
|
|
|
41
|
+
# @param href [String] stylesheet URL
|
|
42
|
+
# @param type [String] MIME type (`text/css` or `text/xsl`)
|
|
43
|
+
# @param media [String] media query hint for the stylesheet
|
|
40
44
|
def initialize(href:, type:, media: 'all')
|
|
41
45
|
raise ArgumentError, 'stylesheet.href must be a String' unless href.is_a?(String)
|
|
42
46
|
raise ArgumentError, 'stylesheet.type invalid' unless TYPES.include?(type)
|
data/lib/html2rss/rss_builder.rb
CHANGED
|
@@ -4,93 +4,98 @@ require 'rss'
|
|
|
4
4
|
|
|
5
5
|
module Html2rss
|
|
6
6
|
##
|
|
7
|
-
# Builds
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
7
|
+
# Builds an RSS Feed by providing channel, articles and stylesheets.
|
|
8
|
+
class RssBuilder
  class << self
    # Copies all article attributes onto the given RSS item maker.
    #
    # @param article [Html2rss::RssBuilder::Article] source article
    # @param item_maker [RSS::Maker::RSS20::ItemsBase::ItemBase] RSS item builder
    # @return [void]
    def add_item(article, item_maker)
      add_item_string_values(article, item_maker)
      add_item_categories(article, item_maker)
      Enclosure.add(article.enclosure, item_maker)
      add_item_guid(article, item_maker)
    end

    private

    # Sets title, description and author (skipping nil/empty values),
    # the link (when the article has a URL) and the publication date.
    def add_item_string_values(article, item_maker)
      %i[title description author].each do |attr|
        next unless (value = article.public_send(attr))
        next if value.empty?

        item_maker.public_send(:"#{attr}=", value)
      end

      item_maker.link = article.url.to_s if article.url
      item_maker.pubDate = article.published_at&.rfc2822
    end

    # Adds one category element per article category.
    def add_item_categories(article, item_maker)
      article.categories.each { |category| item_maker.categories.new_category.content = category }
    end

    # Sets the article GUID; it is a hash value, hence never a permalink.
    def add_item_guid(article, item_maker)
      item_maker.guid.tap do |guid|
        guid.content = article.guid
        guid.isPermaLink = false
      end
    end
  end

  ##
  # @param channel [Html2rss::RssBuilder::Channel] The channel information for the RSS feed.
  # @param articles [Array<Html2rss::RssBuilder::Article>] The list of articles to include in the RSS feed.
  # @param stylesheets [Array<Hash>] An optional array of stylesheet configurations.
  def initialize(channel:, articles:, stylesheets: [])
    @channel = channel
    @articles = articles
    @stylesheets = stylesheets
  end

  # @return [RSS::Rss] RSS 2.0 document instance
  def call
    RSS::Maker.make('2.0') do |maker|
      Stylesheet.add(maker, stylesheets)

      make_channel(maker.channel)
      make_items(maker)
    end
  end

  private

  attr_reader :channel, :articles

  # @return [Array<Html2rss::RssBuilder::Stylesheet>] stylesheet objects built from the configured hashes
  def stylesheets
    @stylesheets.map { |style| Stylesheet.new(**style) }
  end

  # Transfers the channel attributes onto the RSS channel maker.
  def make_channel(maker)
    %i[language title description ttl].each do |key|
      maker.public_send(:"#{key}=", channel.public_send(key))
    end

    maker.link = channel.url.to_s
    maker.generator = generator
    maker.updated = channel.last_build_date
  end

  # Creates one RSS item per article.
  def make_items(maker)
    articles.each do |article|
      maker.items.new_item { |item_maker| self.class.add_item(article, item_maker) }
    end
  end

  # Builds the generator string, listing how many articles each scraper
  # contributed. Articles without a scraper are left out of the tally so
  # that no nameless " (n)" entry shows up.
  #
  # @return [String]
  def generator
    scraper_namespace_regex = /(?<namespace>Html2rss|Scraper)::/

    scraper_counts = articles.flat_map(&:scraper).compact.tally.map do |klass, count|
      scraper_name = klass.to_s.gsub(scraper_namespace_regex, '')
      "#{scraper_name} (#{count})"
    end

    "html2rss V. #{Html2rss::VERSION} (scrapers: #{scraper_counts.join(', ')})"
  end
end
|
|
96
101
|
end
|