elisehuard-media_feed 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
data/lib/media_feed.rb ADDED
@@ -0,0 +1,189 @@
1
+ =begin rdoc
2
+ The media_feed library parses a media rss stream and stores the information in its attributes.
3
+ == Examples:
4
+ Fetch a whole feed
5
+ media_feed = MediaFeed::Feed.new('http://www.nasa.gov/rss/NASAcast_vodcast.rss')
6
+ media_feed.fetch
7
+ Fetch starting at a certain date (date must be in RFC2822 format - use Time.rfc2822)
8
+ media_feed = MediaFeed::Feed.new('http://www.nasa.gov/rss/NASAcast_vodcast.rss')
9
+ media_feed.fetch_since("Wed, 10 Sep 2008 10:29:18 +0200")
10
+ Use the retrieved information
11
+ thumbnail = media_feed.thumbnail
12
+ media_feed.items.each do |item|
13
+ ...
14
+ end
15
+ =end
16
+
17
+ require 'rubygems'
18
+ require 'open-uri'
19
+ require 'libxml'
20
+
21
+ module MediaFeed
22
# Raised when a feed URL does not have a valid format.
#
# Inherits from StandardError (not Exception) so that a bare +rescue+ in
# calling code catches it; rescuing by class name keeps working unchanged.
class InvalidUrl < StandardError
end
25
+
26
# Exception for an invalid Item.
#
# Inherits from StandardError (not Exception) so that a bare +rescue+ in
# calling code catches it; rescuing by class name keeps working unchanged.
class InvalidMediaItem < StandardError
end
29
+
30
# Raised when the feed URL returns an empty result or an HTTP error.
#
# Inherits from StandardError (not Exception) so that a bare +rescue+ in
# calling code catches it; rescuing by class name keeps working unchanged.
class FeedNotFound < StandardError
end
33
+
34
# Raised when the feed body cannot be parsed as XML.
#
# Inherits from StandardError (not Exception) so that a bare +rescue+ in
# calling code catches it; rescuing by class name keeps working unchanged.
class InvalidXML < StandardError
end
37
+
38
# Raised when a date string is not in RFC 2822 format.
#
# Inherits from StandardError (not Exception) so that a bare +rescue+ in
# calling code catches it; rescuing by class name keeps working unchanged.
class InvalidDate < StandardError
end
41
+
42
# Feed handles the retrieval of feed information. Channel-level values are
# exposed through the reader attributes, and the individual entries are stored
# in +items+ as an array of Item objects.
#
# +items+ is nil until a fetch has completed. +pubDate+ is a String: either the
# channel's own pubDate text or, when the channel has none, the RFC 2822 form
# of the first item date encountered.
class Feed
  include(LibXML)

  # Accepted URL shape (http/https only). Anchored with \A and \z rather than
  # ^ and $, which match per line and would let a multi-line string through.
  URL_PATTERN = /\A(http|https):\/\/[a-z0-9]+([-.]{1}[a-z0-9]*)+.[a-z]{2,5}(([0-9]{1,5})?\/.*)?\z/i

  # An ampersand that does not already start one of the predefined XML
  # entities or a numeric character reference.
  BARE_AMPERSAND = /&(?!(?:amp|lt|gt|quot|apos|#[0-9]+|#x[0-9A-Fa-f]+);)/

  attr_reader :title, :description, :thumbnail, :pubDate, :items

  # Feed is initialized with a valid url for the feed itself.
  #
  # Raises ArgumentError when +url+ is nil and InvalidUrl when it does not
  # match URL_PATTERN.
  def initialize(url)
    raise ArgumentError, 'url must not be nil' if url.nil?
    raise InvalidUrl if url.to_s !~ URL_PATTERN

    @url = url
    @items = nil
    @last_date = nil
  end

  # Fetch the whole media feed.
  #
  # Any date filter left over from an earlier fetch_since is cleared first, so
  # this always loads every valid item. Returns the array of items.
  def fetch
    @last_date = nil
    load_feed
  end

  # Fetch all entries of the media feed since a certain date (RFC2822 format).
  #
  # Raises InvalidDate when +last_date+ cannot be parsed. Returns the array of
  # items, or nil when the feed's pubDate equals +last_date+ (nothing new).
  def fetch_since(last_date)
    @last_date = parse_date(last_date)
    load_feed
  end

  # Feed title and description followed by the string form of every item.
  # Safe to call before a fetch (no items are printed then).
  def to_s
    "title #{@title}, description #{@description}" +
      Array(@items).map { |item| item.to_s }.join
  end

  private

  # Download the feed and populate the attributes from it.
  def load_feed
    parse_feed(get_feed)
  end

  # Download the feed body as a String.
  #
  # Raises FeedNotFound on an HTTP error or an empty body.
  def get_feed
    # Open through the parsed URI rather than Kernel#open: Kernel#open no
    # longer handles URLs on Ruby 3.0+, and it treats a string starting with
    # "|" as a command to run.
    response = URI.parse(@url.to_s).open { |f| f.read }
    raise FeedNotFound if response.nil? || response.empty?

    response
  rescue OpenURI::HTTPError
    raise FeedNotFound
  end

  # The XML parser stumbles over bare & symbols, so escape them. Ampersands
  # that already start a predefined entity (&amp; &lt; ...) or a numeric
  # reference are left alone so they are not escaped twice. The argument is
  # not modified; the escaped copy is returned.
  def preprocess(doc)
    doc.gsub(BARE_AMPERSAND, '&amp;')
  end

  # Fill the feed attributes and +items+ from the raw feed text.
  #
  # Returns the items array, or nil (with +items+ set to an empty array) when
  # a date filter is active and the feed's pubDate equals it.
  # Raises InvalidXML when the document has no child element under its root.
  def parse_feed(doc)
    channel = parse_document(preprocess(doc))[0]
    raise InvalidXML if channel.nil?

    @title = content(channel, 'title')
    @description = content(channel, 'description')
    @thumbnail = content(channel, 'image/url')
    @pubDate = content(channel, 'pubDate')

    if @last_date && @pubDate && !@pubDate.empty? && parse_date(@pubDate) == @last_date
      @items = []
      return nil
    end

    loaded = channel.find('//item').map { |node| load_item(node) }.compact
    @items = loaded.select { |item| item.is_after?(@last_date) }
  end

  # Build the Item for +node+, or return nil when the item is faulty.
  # Invalid items (missing fields or an unparseable pubDate) are reported on
  # stderr and skipped; they shouldn't stop the rest of the feed.
  def load_item(node)
    item = parse_item(node)
    return item if item.valid?

    warn "Faulty item in the feed - not loaded : "
    warn item.to_s
    nil
  rescue InvalidDate
    warn "Faulty item in the feed - not loaded : invalid pubDate"
    nil
  end

  # Parse +doc+ and return the child elements of the document root.
  # Any parser failure is reported as InvalidXML.
  def parse_document(doc)
    parser = LibXML::XML::Parser.new
    parser.string = doc
    parser.parse.root.find('*')
  rescue StandardError
    raise InvalidXML
  end

  # Build an Item from an <item> node.
  #
  # The enclosure is taken from the enclosure url, then the media:content
  # url, then the guid text, whichever is present first.
  # Raises InvalidDate when the item carries a non-empty, unparseable pubDate.
  def parse_item(node)
    item = Item.new
    item.title = content(node, 'title')
    item.description = content(node, 'description')
    item.link = content(node, 'link')
    item.enclosure = url(node, 'enclosure') || url(node, 'media:content') || content(node, 'guid')
    date = content(node, 'pubDate')
    item.pubDate = parse_date(date) if date && !date.empty?
    item.thumbnail = content(node, 'image/url')

    # pubDate if not filled in by feed -> first (valid) one is probably most recent
    @pubDate = item.pubDate.rfc2822 if @pubDate.nil? && item.pubDate
    item
  end

  # Text content of the first node matching +xpath+ under +parent_node+,
  # or nil when nothing matches.
  def content(parent_node, xpath)
    node = first_match(parent_node, xpath)
    node.content if node
  end

  # Value of the +url+ attribute of the first node matching +xpath+ under
  # +parent_node+, or nil when nothing matches.
  def url(parent_node, xpath)
    node = first_match(parent_node, xpath)
    node['url'] if node
  end

  # First node matching +xpath+, evaluating the expression only once.
  def first_match(parent_node, xpath)
    matches = parent_node.find(xpath)
    matches[0] if matches
  end

  # Convert an RFC 2822 date string (ex. Sat, 06 Sep 2008 11:59:15 +0000)
  # into a Time. Raises InvalidDate for anything that cannot be parsed.
  def parse_date(date_string)
    Time.rfc2822(date_string)
  rescue StandardError
    raise InvalidDate
  end
end
164
+
165
# A single entry of a media feed: plain read/write attributes plus a few
# helpers for validation, date filtering and printing.
class Item
  attr_accessor :title, :description, :link, :pubDate, :enclosure, :thumbnail

  # An item is valid only when title, link and enclosure are all present
  # (non-nil).
  def valid?
    [title, link, enclosure].none? { |field| field.nil? }
  end

  # True when no filtering applies (either date is nil); otherwise the result
  # of comparing this item's pubDate with +last_date+ using >.
  def is_after?(last_date)
    return true if last_date.nil? || pubDate.nil?

    pubDate > last_date
  end

  # One "name value" line per attribute (thumbnail is not included), each
  # terminated by a newline.
  def to_s
    [
      "title #{title}",
      "description #{description}",
      "link #{link}",
      "enclosure #{enclosure}",
      "pubDate #{pubDate}"
    ].map { |line| line + "\n" }.join
  end
end
189
+ end