simple-rss 1.3.3 → 2.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/LICENSE +79 -7
- data/README.md +321 -0
- data/Rakefile +134 -136
- data/lib/simple-rss.rb +541 -154
- data/simple-rss.gemspec +5 -6
- data/test/base/array_tags_test.rb +37 -0
- data/test/base/base_test.rb +76 -77
- data/test/base/empty_tag_test.rb +56 -0
- data/test/base/encoding_test.rb +87 -0
- data/test/base/enumerable_test.rb +101 -0
- data/test/base/feed_attributes_test.rb +26 -0
- data/test/base/fetch_test.rb +117 -0
- data/test/base/hash_xml_serialization_test.rb +142 -0
- data/test/base/item_attributes_test.rb +26 -0
- data/test/base/json_serialization_test.rb +81 -0
- data/test/data/atom_with_entry_attrs.xml +13 -0
- data/test/data/atom_with_feed_attrs.xml +13 -0
- data/test/data/media_rss.xml +465 -0
- data/test/data/rss20_utf8.xml +61 -0
- data/test/data/rss20_with_channel_attrs.xml +13 -0
- data/test/data/rss20_with_item_attrs.xml +13 -0
- data/test/test_helper.rb +10 -3
- metadata +21 -11
- data/README.markdown +0 -47
- data/install.rb +0 -40
data/lib/simple-rss.rb
CHANGED
|
@@ -1,160 +1,547 @@
|
|
|
1
|
-
|
|
2
|
-
|
|
1
|
+
# rbs_inline: enabled
|
|
2
|
+
|
|
3
|
+
require "cgi"
|
|
4
|
+
require "time"
|
|
3
5
|
|
|
4
6
|
class SimpleRSS
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
7
|
+
# @rbs skip
|
|
8
|
+
include Enumerable
|
|
9
|
+
|
|
10
|
+
# @rbs!
|
|
11
|
+
# include Enumerable[Hash[Symbol, untyped]]
|
|
12
|
+
|
|
13
|
+
VERSION = "2.1.0".freeze
|
|
14
|
+
|
|
15
|
+
# @rbs @items: Array[Hash[Symbol, untyped]]
|
|
16
|
+
# @rbs @source: String
|
|
17
|
+
# @rbs @options: Hash[Symbol, untyped]
|
|
18
|
+
# @rbs @etag: String?
|
|
19
|
+
# @rbs @last_modified: String?
|
|
20
|
+
|
|
21
|
+
attr_reader :items #: Array[Hash[Symbol, untyped]]
|
|
22
|
+
attr_reader :source #: String
|
|
23
|
+
attr_reader :etag #: String?
|
|
24
|
+
attr_reader :last_modified #: String?
|
|
25
|
+
alias entries items #: Array[Hash[Symbol, untyped]]
|
|
26
|
+
|
|
27
|
+
@@feed_tags = %i[
|
|
28
|
+
id
|
|
29
|
+
title subtitle link
|
|
30
|
+
description
|
|
31
|
+
author webMaster managingEditor contributor
|
|
32
|
+
pubDate lastBuildDate updated dc:date
|
|
33
|
+
generator language docs cloud
|
|
34
|
+
ttl skipHours skipDays
|
|
35
|
+
image logo icon rating
|
|
36
|
+
rights copyright
|
|
37
|
+
textInput feedburner:browserFriendly
|
|
38
|
+
itunes:author itunes:category
|
|
39
|
+
]
|
|
40
|
+
|
|
41
|
+
@@item_tags = %i[
|
|
42
|
+
id
|
|
43
|
+
title link link+alternate link+self link+edit link+replies
|
|
44
|
+
author contributor
|
|
45
|
+
description summary content content:encoded comments
|
|
46
|
+
pubDate published updated expirationDate modified dc:date
|
|
47
|
+
category guid
|
|
48
|
+
trackback:ping trackback:about
|
|
49
|
+
dc:creator dc:title dc:subject dc:rights dc:publisher
|
|
50
|
+
feedburner:origLink
|
|
51
|
+
media:content#url media:content#type media:content#height media:content#width media:content#duration
|
|
52
|
+
media:title media:thumbnail#url media:thumbnail#height media:thumbnail#width
|
|
53
|
+
media:credit media:credit#role
|
|
54
|
+
media:category media:category#scheme
|
|
55
|
+
]
|
|
56
|
+
|
|
57
|
+
# @rbs (untyped, ?Hash[Symbol, untyped]) -> void
|
|
58
|
+
def initialize(source, options = {})
|
|
59
|
+
@source = source.respond_to?(:read) ? source.read.to_s : source.to_s
|
|
60
|
+
@items = [] #: Array[Hash[Symbol, untyped]]
|
|
61
|
+
@options = {} #: Hash[Symbol, untyped]
|
|
62
|
+
@options.update(options)
|
|
63
|
+
|
|
64
|
+
parse
|
|
65
|
+
end
|
|
66
|
+
|
|
67
|
+
# @rbs () -> SimpleRSS
|
|
68
|
+
def channel
|
|
69
|
+
self
|
|
70
|
+
end
|
|
71
|
+
alias feed channel
|
|
72
|
+
|
|
73
|
+
# Iterate over all items in the feed
|
|
74
|
+
#
|
|
75
|
+
# @rbs () { (Hash[Symbol, untyped]) -> void } -> self
|
|
76
|
+
# | () -> Enumerator[Hash[Symbol, untyped], self]
|
|
77
|
+
def each(&block)
|
|
78
|
+
return enum_for(:each) unless block
|
|
79
|
+
|
|
80
|
+
items.each(&block)
|
|
81
|
+
self
|
|
82
|
+
end
|
|
83
|
+
|
|
84
|
+
# Access an item by index
|
|
85
|
+
#
|
|
86
|
+
# @rbs (Integer) -> Hash[Symbol, untyped]?
|
|
87
|
+
def [](index)
|
|
88
|
+
items[index]
|
|
89
|
+
end
|
|
90
|
+
|
|
91
|
+
# Get the n most recent items, newest first.
#
# Items are ordered by :pubDate, falling back to :updated. Only Time values
# are used as sort keys: a date that could not be parsed is kept as a String
# on the item, and comparing a String with a Time would raise. Items without
# a usable date sort as the epoch (i.e. last). Items with equal dates keep
# their original feed order.
#
# @rbs (?Integer) -> Array[Hash[Symbol, untyped]]
def latest(count = 10)
  epoch = Time.at(0)

  ordered = items.each_with_index.sort_by do |item, index|
    stamp = [item[:pubDate], item[:updated]].find { |value| value.is_a?(Time) } || epoch
    # -index keeps ties in feed order once the list is reversed below
    [stamp, -index]
  end

  ordered.reverse.first(count).map(&:first)
end
|
|
97
|
+
|
|
98
|
+
# Build a plain Hash representation of the feed: one entry per feed tag that
# has a value, plus :items holding every item with its values serialized.
#
# @rbs (?Hash[Symbol, untyped]) -> Hash[Symbol, untyped]
def as_json(_options = {})
  hash = {} #: Hash[Symbol, untyped]

  @@feed_tags.each do |tag|
    # Attribute tags ("feed#lang") are stored by the parser as "feed_lang".
    # "@feed#lang" is not a legal instance variable name and would raise
    # NameError, so map "#" to "_" before looking the value up.
    tag_cleaned = clean_tag(tag.to_s.tr("#", "_"))
    value = instance_variable_get("@#{tag_cleaned}")
    hash[tag_cleaned] = serialize_value(value) if value
  end

  hash[:items] = items.map do |item|
    item.transform_values { |v| serialize_value(v) }
  end

  hash
end
|
|
114
|
+
|
|
115
|
+
# Serialize the feed (see #as_json) to a JSON string.
#
# Any generator arguments are forwarded, so options and the generator state
# passed when the feed is nested inside another structure are honoured.
#
# @rbs (*untyped) -> String
def to_json(*args)
  require "json" # loaded lazily: only needed when JSON output is requested
  as_json.to_json(*args)
end
|
|
120
|
+
|
|
121
|
+
alias to_hash as_json
|
|
122
|
+
|
|
123
|
+
# @rbs (?format: Symbol) -> String
|
|
124
|
+
def to_xml(format: :rss2)
|
|
125
|
+
case format
|
|
126
|
+
when :rss2 then to_rss2_xml
|
|
127
|
+
when :atom then to_atom_xml
|
|
128
|
+
else raise ArgumentError, "Unknown format: #{format}. Supported: :rss2, :atom"
|
|
129
|
+
end
|
|
130
|
+
end
|
|
131
|
+
|
|
132
|
+
class << self
|
|
133
|
+
# @rbs () -> Array[Symbol]
|
|
134
|
+
def feed_tags
|
|
135
|
+
@@feed_tags
|
|
136
|
+
end
|
|
137
|
+
|
|
138
|
+
# @rbs (Array[Symbol]) -> Array[Symbol]
|
|
139
|
+
def feed_tags=(ft)
|
|
140
|
+
@@feed_tags = ft
|
|
141
|
+
end
|
|
142
|
+
|
|
143
|
+
# @rbs () -> Array[Symbol]
|
|
144
|
+
def item_tags
|
|
145
|
+
@@item_tags
|
|
146
|
+
end
|
|
147
|
+
|
|
148
|
+
# @rbs (Array[Symbol]) -> Array[Symbol]
|
|
149
|
+
def item_tags=(it)
|
|
150
|
+
@@item_tags = it
|
|
151
|
+
end
|
|
152
|
+
|
|
153
|
+
# The strict attribute is for compatibility with Ruby's standard RSS parser
|
|
154
|
+
#
|
|
155
|
+
# @rbs (untyped, ?Hash[Symbol, untyped]) -> SimpleRSS
|
|
156
|
+
def parse(source, options = {})
|
|
157
|
+
new source, options
|
|
158
|
+
end
|
|
159
|
+
|
|
160
|
+
# Fetch and parse a feed from a URL
|
|
161
|
+
# Returns nil if conditional GET returns 304 Not Modified
|
|
162
|
+
#
|
|
163
|
+
# @rbs (String, ?Hash[Symbol, untyped]) -> SimpleRSS?
|
|
164
|
+
def fetch(url, options = {})
|
|
165
|
+
require "net/http"
|
|
166
|
+
require "uri"
|
|
167
|
+
|
|
168
|
+
uri = URI.parse(url)
|
|
169
|
+
response = perform_fetch(uri, options)
|
|
170
|
+
|
|
171
|
+
return nil if response.is_a?(Net::HTTPNotModified)
|
|
172
|
+
|
|
173
|
+
raise SimpleRSSError, "HTTP #{response.code}: #{response.message}" unless response.is_a?(Net::HTTPSuccess)
|
|
174
|
+
|
|
175
|
+
body = response.body.force_encoding(Encoding::UTF_8)
|
|
176
|
+
feed = parse(body, options)
|
|
177
|
+
feed.instance_variable_set(:@etag, response["ETag"])
|
|
178
|
+
feed.instance_variable_set(:@last_modified, response["Last-Modified"])
|
|
179
|
+
feed
|
|
180
|
+
end
|
|
181
|
+
|
|
182
|
+
private
|
|
183
|
+
|
|
184
|
+
# @rbs (untyped, Hash[Symbol, untyped]) -> untyped
|
|
185
|
+
def perform_fetch(uri, options)
|
|
186
|
+
http = build_http(uri, options)
|
|
187
|
+
request = build_request(uri, options)
|
|
188
|
+
|
|
189
|
+
response = http.request(request)
|
|
190
|
+
handle_redirect(response, options) || response
|
|
191
|
+
end
|
|
192
|
+
|
|
193
|
+
# @rbs (untyped, Hash[Symbol, untyped]) -> untyped
|
|
194
|
+
def build_http(uri, options)
|
|
195
|
+
host = uri.host || raise(SimpleRSSError, "Invalid URL: missing host")
|
|
196
|
+
http = Net::HTTP.new(host, uri.port)
|
|
197
|
+
http.use_ssl = uri.scheme == "https"
|
|
198
|
+
|
|
199
|
+
timeout = options[:timeout]
|
|
200
|
+
if timeout
|
|
201
|
+
http.open_timeout = timeout
|
|
202
|
+
http.read_timeout = timeout
|
|
203
|
+
end
|
|
204
|
+
|
|
205
|
+
http
|
|
206
|
+
end
|
|
207
|
+
|
|
208
|
+
# @rbs (untyped, Hash[Symbol, untyped]) -> untyped
|
|
209
|
+
def build_request(uri, options)
|
|
210
|
+
request = Net::HTTP::Get.new(uri)
|
|
211
|
+
request["User-Agent"] = "SimpleRSS/#{VERSION}"
|
|
212
|
+
|
|
213
|
+
# Conditional GET headers
|
|
214
|
+
request["If-None-Match"] = options[:etag] if options[:etag]
|
|
215
|
+
request["If-Modified-Since"] = options[:last_modified] if options[:last_modified]
|
|
216
|
+
|
|
217
|
+
# Custom headers
|
|
218
|
+
options[:headers]&.each { |key, value| request[key] = value }
|
|
219
|
+
|
|
220
|
+
request
|
|
221
|
+
end
|
|
222
|
+
|
|
223
|
+
# Follow an HTTP redirect, returning the response of the redirected fetch.
#
# Returns nil when the response is not a redirect, when redirects are
# disabled via options[:follow_redirects] == false, or when no Location
# header is present. Raises SimpleRSSError after more than 5 hops.
#
# @rbs (untyped, Hash[Symbol, untyped]) -> untyped
def handle_redirect(response, options)
  return nil unless response.is_a?(Net::HTTPRedirection)
  return nil if options[:follow_redirects] == false

  location = response["Location"]
  return nil unless location

  redirects = (options[:_redirects] || 0) + 1
  raise SimpleRSSError, "Too many redirects" if redirects > 5

  # Location may be relative ("/feed.xml"). Resolve it against the URI of
  # the request that produced this response, otherwise the next fetch has
  # no host to connect to.
  target = URI.parse(location)
  base = response.respond_to?(:uri) ? response.uri : nil
  target = URI.join(base.to_s, location) if base && target.relative?

  perform_fetch(target, options.merge(_redirects: redirects))
end
|
|
237
|
+
end
|
|
238
|
+
|
|
239
|
+
DATE_TAGS = %i[pubDate lastBuildDate published updated expirationDate modified dc:date].freeze
|
|
240
|
+
STRIP_HTML_TAGS = %i[author contributor skipHours skipDays].freeze
|
|
241
|
+
|
|
242
|
+
private
|
|
243
|
+
|
|
244
|
+
# @rbs () -> void
|
|
245
|
+
def parse
|
|
246
|
+
raise SimpleRSSError, "Poorly formatted feed" unless @source =~ %r{<(channel|feed).*?>.*?</(channel|feed)>}mi
|
|
247
|
+
|
|
248
|
+
# Feed's title and link
|
|
249
|
+
feed_content = Regexp.last_match(1) if @source =~ %r{(.*?)<(rss:|atom:)?(item|entry).*?>.*?</(rss:|atom:)?(item|entry)>}mi
|
|
250
|
+
|
|
251
|
+
# Capture channel/feed tag attributes
|
|
252
|
+
feed_attrs = nil
|
|
253
|
+
if @source =~ /<(channel|feed)([\s][^>]*)?>/mi
|
|
254
|
+
feed_attrs = Regexp.last_match(2)
|
|
255
|
+
end
|
|
256
|
+
|
|
257
|
+
@@feed_tags.each do |tag|
|
|
258
|
+
next if tag.to_s.strip.empty?
|
|
259
|
+
|
|
260
|
+
tag_str = tag.to_s
|
|
261
|
+
|
|
262
|
+
# Handle channel#attr or feed#attr syntax
|
|
263
|
+
if tag_str.include?("#")
|
|
264
|
+
parse_feed_attr_tag(tag_str, feed_attrs)
|
|
265
|
+
next
|
|
266
|
+
end
|
|
267
|
+
|
|
268
|
+
if feed_content && feed_content =~ %r{<(rss:|atom:)?#{tag}(.*?)>(.*?)</(rss:|atom:)?#{tag}>}mi
|
|
269
|
+
nil
|
|
270
|
+
elsif feed_content && feed_content =~ %r{<(rss:|atom:)?#{tag}(.*?)\/\s*>}mi
|
|
271
|
+
nil
|
|
272
|
+
elsif @source =~ %r{<(rss:|atom:)?#{tag}(.*?)>(.*?)</(rss:|atom:)?#{tag}>}mi
|
|
273
|
+
nil
|
|
274
|
+
elsif @source =~ %r{<(rss:|atom:)?#{tag}(.*?)\/\s*>}mi
|
|
275
|
+
nil
|
|
276
|
+
end
|
|
277
|
+
|
|
278
|
+
next unless Regexp.last_match(2) || Regexp.last_match(3)
|
|
279
|
+
|
|
280
|
+
tag_cleaned = clean_tag(tag)
|
|
281
|
+
instance_variable_set("@#{tag_cleaned}", clean_content(tag, Regexp.last_match(2), Regexp.last_match(3)))
|
|
282
|
+
self.class.class_eval("attr_reader :#{tag_cleaned}")
|
|
283
|
+
end
|
|
284
|
+
|
|
285
|
+
# RSS items' title, link, and description
|
|
286
|
+
@source.scan(%r{<(rss:|atom:)?(item|entry)([\s][^>]*)?>(.*?)</(rss:|atom:)?(item|entry)>}mi) do |match|
|
|
287
|
+
item = {} #: Hash[Symbol, untyped]
|
|
288
|
+
@@item_tags.each do |tag|
|
|
289
|
+
next if tag.to_s.strip.empty?
|
|
290
|
+
|
|
291
|
+
parse_item_tag(item, tag, match[3], match[2])
|
|
292
|
+
end
|
|
293
|
+
item.define_singleton_method(:method_missing) { |name, *| self[name] }
|
|
294
|
+
@items << item
|
|
295
|
+
end
|
|
296
|
+
end
|
|
297
|
+
|
|
298
|
+
# @rbs (Hash[Symbol, untyped], Symbol, String?, String?) -> void
|
|
299
|
+
def parse_item_tag(item, tag, content, item_attrs = nil)
|
|
300
|
+
return if content.nil?
|
|
301
|
+
|
|
302
|
+
tag_str = tag.to_s
|
|
303
|
+
|
|
304
|
+
return parse_rel_tag(item, tag_str, content) if tag_str.include?("+")
|
|
305
|
+
return parse_attr_tag(item, tag_str, content, item_attrs) if tag_str.include?("#")
|
|
306
|
+
|
|
307
|
+
parse_simple_tag(item, tag, content)
|
|
308
|
+
end
|
|
309
|
+
|
|
310
|
+
# @rbs (Hash[Symbol, untyped], String, String) -> void
|
|
311
|
+
def parse_rel_tag(item, tag_str, content)
|
|
312
|
+
tag, rel = tag_str.split("+")
|
|
313
|
+
return unless tag && rel
|
|
314
|
+
|
|
315
|
+
content =~ %r{<(rss:|atom:)?#{tag}(.*?)rel=['"]#{rel}['"](.*?)>(.*?)</(rss:|atom:)?#{tag}>}mi ||
|
|
316
|
+
content =~ %r{<(rss:|atom:)?#{tag}(.*?)rel=['"]#{rel}['"](.*?)/\s*>}mi
|
|
317
|
+
|
|
318
|
+
return unless Regexp.last_match(3) || Regexp.last_match(4)
|
|
319
|
+
|
|
320
|
+
item[clean_tag("#{tag}+#{rel}")] = clean_content(tag.to_sym, Regexp.last_match(3), Regexp.last_match(4))
|
|
321
|
+
end
|
|
322
|
+
|
|
323
|
+
# @rbs (String, String?) -> void
|
|
324
|
+
def parse_feed_attr_tag(tag_str, feed_attrs)
|
|
325
|
+
tag, attrib = tag_str.split("#")
|
|
326
|
+
return unless tag && attrib && feed_attrs
|
|
327
|
+
|
|
328
|
+
# Only handle channel or feed tags
|
|
329
|
+
return unless %w[channel feed].include?(tag)
|
|
330
|
+
return unless feed_attrs =~ /#{attrib}=['"](.*?)['"]/mi
|
|
331
|
+
|
|
332
|
+
tag_cleaned = clean_tag("#{tag}_#{attrib}")
|
|
333
|
+
instance_variable_set("@#{tag_cleaned}", clean_content(tag.to_sym, attrib, Regexp.last_match(1)))
|
|
334
|
+
self.class.class_eval("attr_reader :#{tag_cleaned}")
|
|
335
|
+
end
|
|
336
|
+
|
|
337
|
+
# @rbs (Hash[Symbol, untyped], String, String, String?) -> void
|
|
338
|
+
def parse_attr_tag(item, tag_str, content, item_attrs = nil)
|
|
339
|
+
tag, attrib = tag_str.split("#")
|
|
340
|
+
return unless tag && attrib
|
|
341
|
+
|
|
342
|
+
# Handle attributes on the item/entry tag itself
|
|
343
|
+
if %w[item entry].include?(tag) && item_attrs
|
|
344
|
+
return unless item_attrs =~ /#{attrib}=['"](.*?)['"]/mi
|
|
345
|
+
|
|
346
|
+
item[clean_tag("#{tag}_#{attrib}")] = clean_content(tag.to_sym, attrib, Regexp.last_match(1))
|
|
347
|
+
return
|
|
348
|
+
end
|
|
349
|
+
|
|
350
|
+
content =~ %r{<(rss:|atom:)?#{tag}(.*?)#{attrib}=['"](.*?)['"](.*?)>(.*?)</(rss:|atom:)?#{tag}>}mi ||
|
|
351
|
+
content =~ %r{<(rss:|atom:)?#{tag}(.*?)#{attrib}=['"](.*?)['"](.*?)/\s*>}mi
|
|
352
|
+
|
|
353
|
+
return unless Regexp.last_match(3)
|
|
354
|
+
|
|
355
|
+
item[clean_tag("#{tag}_#{attrib}")] = clean_content(tag.to_sym, attrib, Regexp.last_match(3))
|
|
356
|
+
end
|
|
357
|
+
|
|
358
|
+
# @rbs (Hash[Symbol, untyped], Symbol, String) -> void
|
|
359
|
+
def parse_simple_tag(item, tag, content)
|
|
360
|
+
# Handle array_tags option - collect all values for this tag
|
|
361
|
+
if array_tag?(tag)
|
|
362
|
+
values = content.scan(%r{<(rss:|atom:)?#{tag}(?:[^>]*)>(.*?)</(rss:|atom:)?#{tag}>}mi).map do |match|
|
|
363
|
+
clean_content(tag, nil, match[1])
|
|
364
|
+
end
|
|
365
|
+
item[clean_tag(tag)] = values unless values.empty?
|
|
366
|
+
return
|
|
367
|
+
end
|
|
368
|
+
|
|
369
|
+
content =~ %r{<(rss:|atom:)?#{tag}(.*?)>(.*?)</(rss:|atom:)?#{tag}>}mi ||
|
|
370
|
+
content =~ %r{<(rss:|atom:)?#{tag}(.*?)/\s*>}mi
|
|
371
|
+
|
|
372
|
+
return unless Regexp.last_match(2) || Regexp.last_match(3)
|
|
373
|
+
|
|
374
|
+
item[clean_tag(tag)] = clean_content(tag, Regexp.last_match(2), Regexp.last_match(3))
|
|
375
|
+
end
|
|
376
|
+
|
|
377
|
+
# Whether every occurrence of +tag+ should be collected into an Array,
# as requested through the :array_tags option.
#
# Entries of :array_tags are compared by name, so both Symbols and Strings
# are accepted. Returns false when the option is absent or not an Array.
#
# @rbs (Symbol) -> bool
def array_tag?(tag)
  array_tags = @options[:array_tags]
  return false unless array_tags.is_a?(Array)

  wanted = tag.to_s
  array_tags.any? { |candidate| candidate.to_s == wanted }
end
|
|
384
|
+
|
|
385
|
+
# @rbs (Symbol, String?, String?) -> (Time | String)
|
|
386
|
+
def clean_content(tag, attrs, content)
|
|
387
|
+
content = content.to_s
|
|
388
|
+
|
|
389
|
+
return parse_date(content) if DATE_TAGS.include?(tag)
|
|
390
|
+
return unescape(content.gsub(/<.*?>/, "")) if STRIP_HTML_TAGS.include?(tag)
|
|
391
|
+
return extract_href(attrs) if content.empty? && attrs
|
|
392
|
+
|
|
393
|
+
unescape(content)
|
|
394
|
+
end
|
|
395
|
+
|
|
396
|
+
# @rbs (String) -> (Time | String)
|
|
397
|
+
def parse_date(content)
|
|
398
|
+
Time.parse(content)
|
|
399
|
+
rescue StandardError
|
|
400
|
+
unescape(content)
|
|
401
|
+
end
|
|
402
|
+
|
|
403
|
+
# @rbs (String?) -> String
|
|
404
|
+
def extract_href(attrs)
|
|
405
|
+
return "" unless "#{attrs} " =~ /href=['"]?([^'"]*)['" ]/mi
|
|
406
|
+
|
|
407
|
+
Regexp.last_match(1)&.strip || ""
|
|
408
|
+
end
|
|
409
|
+
|
|
410
|
+
# @rbs (Symbol | String) -> Symbol
|
|
411
|
+
def clean_tag(tag)
|
|
412
|
+
tag.to_s.tr(":", "_").intern
|
|
413
|
+
end
|
|
414
|
+
|
|
415
|
+
# @rbs (untyped) -> untyped
|
|
416
|
+
def serialize_value(value)
|
|
417
|
+
case value
|
|
418
|
+
when Time then value.iso8601
|
|
419
|
+
else value
|
|
420
|
+
end
|
|
421
|
+
end
|
|
422
|
+
|
|
423
|
+
# Escape the five XML special characters so +text+ can be placed in element
# content or in a quoted attribute value. nil becomes the empty string and
# any other value is converted with to_s first.
#
# @rbs (String?) -> String
def escape_xml(text)
  return "" if text.nil?

  # "&" must be replaced first so the entities produced below are not
  # escaped a second time.
  text.to_s
      .gsub("&", "&amp;")
      .gsub("<", "&lt;")
      .gsub(">", "&gt;")
      .gsub("'", "&apos;")
      .gsub('"', "&quot;")
end
|
|
434
|
+
|
|
435
|
+
# @rbs (Array[String], String, untyped) -> void
|
|
436
|
+
def add_xml_element(elements, tag, value)
|
|
437
|
+
elements << "<#{tag}>#{escape_xml(value)}</#{tag}>" if value
|
|
438
|
+
end
|
|
439
|
+
|
|
440
|
+
# @rbs (Array[String], String, untyped, Symbol) -> void
|
|
441
|
+
def add_xml_time_element(elements, tag, value, format)
|
|
442
|
+
return unless value.is_a?(Time)
|
|
443
|
+
|
|
444
|
+
formatted = format == :rfc2822 ? value.rfc2822 : value.iso8601
|
|
445
|
+
elements << "<#{tag}>#{formatted}</#{tag}>"
|
|
446
|
+
end
|
|
447
|
+
|
|
448
|
+
# @rbs () -> String
|
|
449
|
+
def to_rss2_xml
|
|
450
|
+
xml = ['<?xml version="1.0" encoding="UTF-8"?>', '<rss version="2.0">', "<channel>"]
|
|
451
|
+
xml.concat(rss2_channel_elements)
|
|
452
|
+
items.each { |item| xml.concat(rss2_item_elements(item)) }
|
|
453
|
+
xml << "</channel>"
|
|
454
|
+
xml << "</rss>"
|
|
455
|
+
xml.join("\n")
|
|
456
|
+
end
|
|
457
|
+
|
|
458
|
+
# @rbs () -> Array[String]
|
|
459
|
+
def rss2_channel_elements
|
|
460
|
+
elements = [] #: Array[String]
|
|
461
|
+
add_xml_element(elements, "title", instance_variable_get(:@title))
|
|
462
|
+
add_xml_element(elements, "link", instance_variable_get(:@link))
|
|
463
|
+
add_xml_element(elements, "description", instance_variable_get(:@description))
|
|
464
|
+
add_xml_element(elements, "language", instance_variable_get(:@language))
|
|
465
|
+
add_xml_time_element(elements, "pubDate", instance_variable_get(:@pubDate), :rfc2822)
|
|
466
|
+
add_xml_time_element(elements, "lastBuildDate", instance_variable_get(:@lastBuildDate), :rfc2822)
|
|
467
|
+
add_xml_element(elements, "generator", instance_variable_get(:@generator))
|
|
468
|
+
elements
|
|
469
|
+
end
|
|
470
|
+
|
|
471
|
+
# Build the XML lines for one RSS 2.0 <item>, including the opening and
# closing tags. Only fields present on the item are emitted; pubDate is
# emitted only when it is a Time.
#
# @rbs (Hash[Symbol, untyped]) -> Array[String]
def rss2_item_elements(item)
  elements = ["<item>"] #: Array[String]
  elements << "<title>#{escape_xml(item[:title])}</title>" if item[:title]
  elements << "<link>#{escape_xml(item[:link])}</link>" if item[:link]

  if item[:description]
    # A literal "]]>" would end the CDATA section early; split it across
    # two adjacent sections so the text survives intact.
    description = item[:description].to_s.gsub("]]>", "]]]]><![CDATA[>")
    elements << "<description><![CDATA[#{description}]]></description>"
  end

  elements << "<pubDate>#{item[:pubDate].rfc2822}</pubDate>" if item[:pubDate].is_a?(Time)
  elements << "<guid>#{escape_xml(item[:guid])}</guid>" if item[:guid]
  elements << "<author>#{escape_xml(item[:author])}</author>" if item[:author]

  # With the :array_tags option a category is an Array; emit one element
  # per value instead of the Array's inspect-style string.
  if item[:category]
    Array(item[:category]).each do |category|
      elements << "<category>#{escape_xml(category)}</category>"
    end
  end

  elements << "</item>"
  elements
end
|
|
484
|
+
|
|
485
|
+
# @rbs () -> String
|
|
486
|
+
def to_atom_xml
|
|
487
|
+
xml = ['<?xml version="1.0" encoding="UTF-8"?>', '<feed xmlns="http://www.w3.org/2005/Atom">']
|
|
488
|
+
xml.concat(atom_feed_elements)
|
|
489
|
+
items.each { |item| xml.concat(atom_entry_elements(item)) }
|
|
490
|
+
xml << "</feed>"
|
|
491
|
+
xml.join("\n")
|
|
492
|
+
end
|
|
493
|
+
|
|
494
|
+
# Build the feed-level XML lines for Atom output (title, link, id, updated,
# subtitle, author, generator). Only values that are set are emitted.
#
# @rbs () -> Array[String]
def atom_feed_elements
  elements = [] #: Array[String]
  title_val = instance_variable_get(:@title)
  link_val = instance_variable_get(:@link)
  id_val = instance_variable_get(:@id)

  add_xml_element(elements, "title", title_val)
  elements << "<link href=\"#{escape_xml(link_val)}\" rel=\"alternate\"/>" if link_val

  # Prefer the feed's own id and fall back to its link. The id must be
  # emitted whenever either is available, not only when a link exists.
  feed_id = id_val || link_val
  elements << "<id>#{escape_xml(feed_id)}</id>" if feed_id

  add_xml_time_element(elements, "updated", instance_variable_get(:@updated), :iso8601)
  add_xml_element(elements, "subtitle", instance_variable_get(:@subtitle))

  author_val = instance_variable_get(:@author)
  elements << "<author><name>#{escape_xml(author_val)}</name></author>" if author_val

  add_xml_element(elements, "generator", instance_variable_get(:@generator))
  elements
end
|
|
510
|
+
|
|
511
|
+
# Build the XML lines for one Atom <entry>, including the opening and
# closing tags. Only fields present on the item are emitted. The entry id
# is the first available of :id, :guid and :link; the summary falls back
# to :description.
#
# @rbs (Hash[Symbol, untyped]) -> Array[String]
def atom_entry_elements(item)
  # A literal "]]>" would end a CDATA section early; split it across two
  # adjacent sections so the text survives intact.
  cdata = ->(text) { text.to_s.gsub("]]>", "]]]]><![CDATA[>") }
  entry_id = item[:id] || item[:guid] || item[:link]
  summary = item[:summary] || item[:description]

  elements = ["<entry>"] #: Array[String]
  elements << "<title>#{escape_xml(item[:title])}</title>" if item[:title]
  elements << "<link href=\"#{escape_xml(item[:link])}\" rel=\"alternate\"/>" if item[:link]
  elements << "<id>#{escape_xml(entry_id)}</id>" if entry_id
  elements << "<updated>#{item[:updated].iso8601}</updated>" if item[:updated].is_a?(Time)
  atom_entry_published(elements, item)
  elements << "<summary><![CDATA[#{cdata.call(summary)}]]></summary>" if summary
  elements << "<content><![CDATA[#{cdata.call(item[:content])}]]></content>" if item[:content]
  elements << "<author><name>#{escape_xml(item[:author])}</name></author>" if item[:author]

  # With the :array_tags option a category is an Array; emit one element
  # per value instead of the Array's inspect-style string.
  if item[:category]
    Array(item[:category]).each do |term|
      elements << "<category term=\"#{escape_xml(term)}\"/>"
    end
  end

  elements << "</entry>"
  elements
end
|
|
526
|
+
|
|
527
|
+
# @rbs (Array[String], Hash[Symbol, untyped]) -> void
|
|
528
|
+
def atom_entry_published(elements, item)
|
|
529
|
+
if item[:published].is_a?(Time)
|
|
530
|
+
elements << "<published>#{item[:published].iso8601}</published>"
|
|
531
|
+
elsif item[:pubDate].is_a?(Time)
|
|
532
|
+
elements << "<published>#{item[:pubDate].iso8601}</published>"
|
|
533
|
+
end
|
|
534
|
+
end
|
|
535
|
+
|
|
536
|
+
# @rbs (String) -> String
|
|
152
537
|
def unescape(content)
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
538
|
+
result = if content =~ %r{([^-_.!~*'()a-zA-Z\d;/?:@&=+$,\[\]]%)}
|
|
539
|
+
CGI.unescape(content)
|
|
540
|
+
else
|
|
541
|
+
content
|
|
542
|
+
end.gsub(/(<!\[CDATA\[|\]\]>)/, "").strip
|
|
543
|
+
|
|
544
|
+
result.encode(Encoding::UTF_8)
|
|
158
545
|
end
|
|
159
546
|
end
|
|
160
547
|
|