elisehuard-media_feed 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/media_feed.rb +189 -0
- data/spec/feeds/ChrisPirilloShow +948 -0
- data/spec/feeds/NASAcast_vodcast +4 -0
- data/spec/feeds/diggnation +850 -0
- data/spec/feeds/tedtalks_video +4272 -0
- data/spec/media_feed_spec.rb +105 -0
- data/spec/spec_helper.rb +14 -0
- metadata +66 -0
data/lib/media_feed.rb
ADDED
@@ -0,0 +1,189 @@
|
|
1
|
+
=begin rdoc
|
2
|
+
The media_feed library parses a media rss stream and stores the information in its attributes.
|
3
|
+
== Examples:
|
4
|
+
Fetch a whole feed
|
5
|
+
media_feed = MediaFeed::Feed.new('http://www.nasa.gov/rss/NASAcast_vodcast.rss')
|
6
|
+
media_feed.fetch
|
7
|
+
Fetch only the items published after a certain date (pass the date as an RFC2822 string, e.g. the result of Time.now.rfc2822)
|
8
|
+
media_feed = MediaFeed::Feed.new('http://www.nasa.gov/rss/NASAcast_vodcast.rss')
|
9
|
+
media_feed.fetch_since("Wed, 10 Sep 2008 10:29:18 +0200")
|
10
|
+
Use the retrieved information
|
11
|
+
thumbnail = media_feed.thumbnail
|
12
|
+
media_feed.items.each do |item|
|
13
|
+
...
|
14
|
+
end
|
15
|
+
=end
|
16
|
+
|
17
|
+
require 'rubygems'
|
18
|
+
require 'open-uri'
|
19
|
+
require 'libxml'
|
20
|
+
|
21
|
+
module MediaFeed
|
22
|
+
# Raised by Feed#initialize when the given url does not match the expected
# http(s) URL format.
#
# Derives from StandardError (not Exception) so that a plain +rescue+ in
# calling code catches it; +rescue InvalidUrl+ and +rescue Exception+ keep
# working as before.
class InvalidUrl < StandardError
end
|
25
|
+
|
26
|
+
# Exception for an invalid Item.
#
# Derives from StandardError (not Exception) so that a plain +rescue+ in
# calling code catches it; +rescue InvalidMediaItem+ and +rescue Exception+
# keep working as before.
class InvalidMediaItem < StandardError
end
|
29
|
+
|
30
|
+
# Raised when the feed url returns an empty result or an HTTP error.
#
# Derives from StandardError (not Exception) so that a plain +rescue+ in
# calling code catches it; +rescue FeedNotFound+ and +rescue Exception+ keep
# working as before.
class FeedNotFound < StandardError
end
|
33
|
+
|
34
|
+
# Raised when the downloaded feed cannot be parsed as XML.
#
# Derives from StandardError (not Exception) so that a plain +rescue+ in
# calling code catches it; +rescue InvalidXML+ and +rescue Exception+ keep
# working as before.
class InvalidXML < StandardError
end
|
37
|
+
|
38
|
+
# Raised when a date given as input, or found in the feed, is not a valid
# RFC2822 date string.
#
# Derives from StandardError (not Exception) so that a plain +rescue+ in
# calling code catches it; +rescue InvalidDate+ and +rescue Exception+ keep
# working as before.
class InvalidDate < StandardError
end
|
41
|
+
|
42
|
+
# Feed handles the retrieval of a media RSS feed. The channel-level values
# (title, description, thumbnail, pubDate) are exposed as reader attributes,
# and the individual entries are stored in +items+ as an array of Item objects.
# All attributes are nil until a fetch has been performed.
class Feed
  include(LibXML)

  # Format a feed url must have to be accepted by #initialize.
  # Anchored with \A and \z rather than ^ and $: the line anchors would let a
  # multi-line string through as soon as one of its lines looks like a URL.
  URL_PATTERN = /(\A(http|https):\/\/[a-z0-9]+([-.]{1}[a-z0-9]*)+.[a-z]{2,5}(([0-9]{1,5})?\/.*)?\z)/i

  # Matches either a whole CDATA section or an ampersand that does not start
  # an entity/character reference. CDATA sections are matched only so that
  # they can be skipped untouched by #preprocess.
  ESCAPABLE = /<!\[CDATA\[.*?\]\]>|&(?!(?:[A-Za-z][A-Za-z0-9]*|#[0-9]+|#x[0-9A-Fa-f]+);)/m

  attr_reader :title, :description, :thumbnail, :pubDate, :items

  # Feed is initialized with a valid url for the feed itself.
  #
  # Raises ArgumentError when +url+ is nil and InvalidUrl when it does not
  # match URL_PATTERN.
  def initialize(url)
    raise ArgumentError, 'url must not be nil' if url.nil?
    raise InvalidUrl if url !~ URL_PATTERN
    @url = url
    @items = nil
    @last_date = nil
  end

  # Fetch the media feed and fill in the attributes.
  #
  # Returns the array of loaded items. Note that a date given to an earlier
  # #fetch_since call stays in effect and keeps filtering the items.
  #
  # Raises FeedNotFound, InvalidXML or InvalidDate (see the private helpers).
  def fetch
    doc = get_feed
    return nil if doc.nil?
    parse_feed(doc)
  end

  # Fetch all entries of the media feed published after +last_date+, which
  # must be a String in RFC2822 format.
  #
  # Returns the array of loaded items, or nil when the feed's own pubDate
  # equals +last_date+ (nothing new); in that case +items+ is left untouched.
  #
  # Raises InvalidDate when +last_date+ cannot be parsed.
  def fetch_since(last_date)
    @last_date = parse_date(last_date)
    fetch
  end

  # Title and description of the feed followed by the text of every item.
  # Works before the first fetch as well (no items yet).
  def to_s
    "title #{@title}, description #{@description}" +
      (@items || []).map { |item| item.to_s }.join
  end

  private

  # Download the feed and return its body as a String.
  #
  # Raises FeedNotFound on an HTTP error or an empty body.
  def get_feed
    # URI#open (added to http/https URIs by open-uri) is used instead of
    # Kernel#open: it can only ever perform a request, never open a local
    # file or a "|command" pipe, and it does not depend on Kernel#open
    # understanding URLs.
    response = URI.parse(@url).open { |f| f.read }
    raise FeedNotFound if response.nil? || response.empty?
    response
  rescue OpenURI::HTTPError
    raise FeedNotFound
  end

  # libxml stumbles over bare & symbols, so they are escaped before parsing.
  # Ampersands that already start an entity or character reference, and
  # everything inside CDATA sections, are left alone. Returns a new String;
  # +doc+ itself is not modified.
  def preprocess(doc)
    doc.gsub(ESCAPABLE) { |match| match == '&' ? '&amp;' : match }
  end

  # Fill the feed attributes and +items+ from the raw feed text.
  #
  # Returns the array of items, or nil when a reference date is set and the
  # feed's pubDate is equal to it.
  def parse_feed(doc)
    # the first child element of the root is taken to be the channel
    channel = parse_document(preprocess(doc))[0]
    raise InvalidXML if channel.nil?

    @title = content(channel, 'title')
    @description = content(channel, 'description')
    @thumbnail = content(channel, 'image/url')
    @pubDate = content(channel, 'pubDate')
    if @pubDate && !@pubDate.empty? && @last_date
      # feed not republished since the reference date: nothing new to load
      return if parse_date(@pubDate) == @last_date
    end

    @items = channel.find('//item').inject([]) do |result, node|
      item = parse_item(node)
      if item.valid?
        result << item if item.is_after?(@last_date)
      else
        # invalid items shouldn't be added to feed but shouldn't stop the rest of the feed
        puts "Faulty item in the feed - not loaded : "
        puts item.to_s
      end
      result
    end
  end

  # Parse the feed text and return the child elements of the root element.
  #
  # Raises InvalidXML when anything goes wrong during parsing.
  def parse_document(doc)
    parser = LibXML::XML::Parser.new
    parser.string = doc
    parser.parse.root.find('*')
  rescue
    raise InvalidXML
  end

  # Build an Item from an <item> node.
  #
  # Raises InvalidDate when the item carries an unparseable pubDate.
  def parse_item(node)
    item = Item.new
    item.title = content(node, 'title')
    item.description = content(node, 'description')
    item.link = content(node, 'link')
    # first available of: enclosure url, media:content url, guid text
    item.enclosure = url(node, 'enclosure') || url(node, 'media:content') || content(node, 'guid')
    date = content(node, 'pubDate')
    item.pubDate = parse_date(date) if date && !date.empty?
    item.thumbnail = content(node, 'image/url')

    # pubDate if not filled in by feed -> first (valid) one is probably most recent
    @pubDate = item.pubDate.rfc2822 if @pubDate.nil? && item.pubDate
    item
  end

  # Text content of the first node matching +xpath+ under +parent_node+,
  # or nil when nothing matches.
  def content(parent_node, xpath)
    node = first_match(parent_node, xpath)
    node.content if node
  end

  # Value of the 'url' attribute of the first node matching +xpath+ under
  # +parent_node+, or nil when nothing matches.
  def url(parent_node, xpath)
    # ,['media:http://search.yahoo.com/mrss']
    node = first_match(parent_node, xpath)
    node['url'] if node
  end

  # First node matching +xpath+ under +parent_node+ (nil if none); runs the
  # XPath query only once.
  def first_match(parent_node, xpath)
    matches = parent_node.find(xpath)
    matches && matches[0]
  end

  # Convert an RFC2822 date string (ex. Sat, 06 Sep 2008 11:59:15 +0000)
  # into a Time.
  #
  # Raises InvalidDate when the string cannot be parsed.
  def parse_date(date_string)
    Time.rfc2822(date_string)
  rescue
    raise InvalidDate
  end

end
|
164
|
+
|
165
|
+
# Item holds the values of a single entry of a media feed. It is a plain
# value holder: every attribute can be read and written freely.
class Item
  attr_accessor :title, :description, :link, :pubDate, :enclosure, :thumbnail

  # true when title, link and enclosure are all set (non-nil).
  def valid?
    [title, link, enclosure].none? { |field| field.nil? }
  end

  # true when the item was published after +last_date+. When either date is
  # missing the item is considered to be after.
  def is_after?(last_date)
    return true if last_date.nil? || pubDate.nil?
    pubDate > last_date
  end

  # One "name value" line per attribute (the thumbnail is not included).
  def to_s
    [
      "title #{title}\n",
      "description #{description}\n",
      "link #{link}\n",
      "enclosure #{enclosure}\n",
      "pubDate #{pubDate}\n"
    ].join
  end

end
|
189
|
+
end
|