openlogic-feed-normalizer 1.5.3

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,117 @@
1
+ require 'rss'
2
+
3
# content:encoded support is only mixed into the RDF Item by default
# (observed with rss 0.1.6), so add it to the RSS 2.0 Item when it is absent.
item_lacks_content = !RSS::Rss::Channel::Item.new.respond_to?(:content_encoded)
RSS::Rss::Channel::Item.send(:include, RSS::ContentModel) if item_lacks_content # :nodoc:
9
+
10
# Add value-based equality onto Enclosures.
#
# Two enclosures are equal when they are instances of the same class and every
# instance variable set on either of them holds an equal value. +hash+ is
# overridden to agree with +eql?+, so equal enclosures are also treated as
# duplicates by Array#uniq and as the same Hash key.
class RSS::Rss::Channel::Item::Enclosure
  # Returns true when +enc+ is an enclosure carrying the same state.
  def eql?(enc)
    return true if equal?(enc)
    return false unless enc.instance_of?(self.class)

    # Walk the union of names so the comparison is symmetric: a variable set
    # on only one side is compared against nil on the other.
    (instance_variables | enc.instance_variables).all? do |iv|
      instance_variable_get(iv) == enc.instance_variable_get(iv)
    end
  end

  alias == eql?

  # Hash code derived from the same state that +eql?+ compares. Unset and nil
  # variables are skipped because +eql?+ treats them as equivalent.
  def hash
    state = instance_variables.map { |iv| [iv.to_s, instance_variable_get(iv)] }
    state = state.reject { |pair| pair.last.nil? }.sort_by { |pair| pair.first }
    [self.class, state].hash
  end
end
20
+
21
+ module FeedNormalizer
22
# Parser adapter around RSS::Parser (loaded by <tt>require 'rss'</tt>).
#
# +parse+ only yields a Feed for documents whose parsed form exposes a
# channel; for anything else, or when parsing raises, it returns nil.
class RubyRssParser < Parser

  # The underlying parser class.
  def self.parser
    RSS::Parser
  end

  # Parses +xml+ and packages the result as a Feed.
  #
  # xml::   the feed document handed to RSS::Parser.parse.
  # loose:: when true every item category is kept, otherwise only the first.
  #
  # Returns the Feed, or nil when parsing raises or the result has no channel.
  def self.parse(xml, loose)
    begin
      rss = parser.parse(xml)
    rescue StandardError
      # A failed parse is reported as nil. Only StandardError is rescued so
      # that interrupts and exit requests still propagate to the caller.
      return nil
    end

    # check for channel to make sure we're only dealing with RSS.
    rss && rss.respond_to?(:channel) ? package(rss, loose) : nil
  end

  # Fairly high priority; a fast and strict parser.
  def self.priority
    100
  end

  # NOTE: a bare +protected+ only affects instance methods, so the singleton
  # methods defined below remain publicly callable. It is kept as in the
  # original so the class body and its callers are unaffected.
  protected

  # Builds a Feed from a parsed RSS document.
  #
  # rss::   the object returned by RSS::Parser.parse.
  # loose:: see +parse+.
  #
  # Relies on +map_functions!+, which is not defined in this class
  # (presumably inherited from Parser).
  def self.package(rss, loose)
    feed = Feed.new(self)

    # channel elements
    feed_mapping = {
      :generator => :generator,
      :title => :title,
      :urls => :link,
      :description => :description,
      :copyright => :copyright,
      :authors => :managingEditor,
      :last_updated => [:lastBuildDate, :pubDate, :dc_date],
      :id => :guid,
      :ttl => :ttl
    }

    # make two passes, to catch all possible root elements
    map_functions!(feed_mapping, rss, feed)
    map_functions!(feed_mapping, rss.channel, feed)

    # custom channel elements
    feed.image = rss.image ? rss.image.url : nil
    feed.skip_hours = skip(rss, :skipHours)
    feed.skip_days = skip(rss, :skipDays)

    # item elements
    item_mapping = {
      :date_published => [:pubDate, :dc_date],
      :urls => :link,
      :enclosures => :enclosure,
      :description => :description,
      :content => [:content_encoded, :description],
      :title => :title,
      :authors => [:author, :dc_creator],
      :last_updated => [:pubDate, :dc_date] # This is effectively an alias for date_published for this parser.
    }

    rss.items.each do |rss_item|
      # some feeds return empty items
      next if rss_item.title.nil? && rss_item.description.nil?

      feed_entry = Entry.new
      map_functions!(item_mapping, rss_item, feed_entry)

      # custom item elements
      feed_entry.id = rss_item.guid.content if rss_item.respond_to?(:guid) && rss_item.guid
      # fall back to link for ID
      feed_entry.id ||= rss_item.link if rss_item.respond_to?(:link) && rss_item.link
      # NOTE(review): the guard inspects the item but the value is read from
      # the document root; confirm whether the channel copyright was intended.
      feed_entry.copyright = rss.copyright if rss_item.respond_to? :copyright

      # Any failure while reading categories (for instance an item without
      # any) yields an empty list.
      feed_entry.categories =
        begin
          if loose
            rss_item.categories.collect { |c| c.content }
          else
            [rss_item.categories.first.content]
          end
        rescue StandardError
          []
        end

      feed.entries << feed_entry
    end

    feed
  end

  # Collects the contents of the channel's skipHours / skipDays element.
  #
  # parser::    the parsed RSS document; its +channel+ is inspected.
  # attribute:: :skipHours or :skipDays.
  #
  # Returns an array of the entries' +content+ values, or nil when the
  # channel has no such element.
  def self.skip(parser, attribute)
    collection = { :skipHours => :hours, :skipDays => :days }[attribute]
    channel = parser.channel

    return nil unless channel.respond_to?(attribute)
    skipped = channel.send(attribute)
    return nil unless skipped

    skipped.send(collection).collect { |e| e.content }
  end

end
117
+ end
@@ -0,0 +1,142 @@
1
+ require 'simple-rss'
2
+
3
# Monkey patches for outstanding issues logged in the simple-rss project.
# * Add support for issued time field:
#   http://rubyforge.org/tracker/index.php?func=detail&aid=13980&group_id=893&atid=3517
# * The '+' symbol is lost when escaping fields.
#   http://rubyforge.org/tracker/index.php?func=detail&aid=10852&group_id=893&atid=3517
#
class SimpleRSS
  @@item_tags << :issued

  # Replaces the library's clean_content.
  #
  # Date-like tags are parsed into Time (falling back to the unescaped text
  # when parsing raises), author-like and skip tags have markup stripped, and
  # any other tag with empty content takes the href found in +attrs+, if any.
  undef clean_content
  def clean_content(tag, attrs, content)
    text = content.to_s

    case tag
    when :pubDate, :lastBuildDate, :published, :updated, :expirationDate, :modified, :'dc:date', :issued
      begin
        Time.parse(text)
      rescue StandardError
        unescape(text)
      end
    when :author, :contributor, :skipHours, :skipDays
      stripped = text.gsub(/<.*?>/,'')
      unescape(stripped)
    else
      if text.empty? && "#{attrs} " =~ /href=['"]?([^'"]*)['" ]/mi
        $1.strip
      else
        unescape(text)
      end
    end
  end

  # Replaces the library's unescape.
  #
  # Text with a line that begins with a CDATA marker has the markers removed,
  # text that already contains angle brackets is returned untouched, and
  # everything else is passed to FeedNormalizer::HtmlCleaner.unescapeHTML.
  undef unescape
  def unescape(s)
    # Raw HTML is inside the CDATA, so just remove the CDATA wrapper.
    return s.gsub(/(<!\[CDATA\[|\]\]>)/,'') if s =~ /^\s*(<!\[CDATA\[|\]\]>)/

    # Already looks like HTML.
    return s if s =~ /[<>]/

    # Make it HTML.
    FeedNormalizer::HtmlCleaner.unescapeHTML(s)
  end
end
39
+
40
+ module FeedNormalizer
41
+
42
# The SimpleRSS parser can handle both RSS and Atom feeds.
#
# +parse+ returns a Feed built from SimpleRSS's result, or nil when
# SimpleRSS raises while parsing.
class SimpleRssParser < Parser

  # The underlying parser class.
  def self.parser
    SimpleRSS
  end

  # Parses +xml+ and packages the result as a Feed.
  #
  # xml::   the feed document handed to SimpleRSS.parse.
  # loose:: accepted for interface compatibility; not used by this parser.
  #
  # Returns the Feed, or nil when parsing raises.
  def self.parse(xml, loose)
    begin
      atomrss = parser.parse(xml)
    rescue StandardError
      # A failed parse is reported as nil. Only StandardError is rescued so
      # that interrupts and exit requests still propagate to the caller.
      return nil
    end

    package(atomrss)
  end

  # Fairly low priority; a slower, liberal parser.
  def self.priority
    900
  end

  # NOTE: a bare +protected+ only affects instance methods, so the singleton
  # methods defined below remain publicly callable. It is kept as in the
  # original so the class body and its callers are unaffected.
  protected

  # Builds a Feed from a parsed SimpleRSS document.
  #
  # Relies on +map_functions!+, which is not defined in this class
  # (presumably inherited from Parser).
  def self.package(atomrss)
    feed = Feed.new(self)

    # root elements
    feed_mapping = {
      :generator => :generator,
      :title => :title,
      :last_updated => [:updated, :lastBuildDate, :pubDate, :dc_date],
      :copyright => [:copyright, :rights],
      :authors => [:author, :webMaster, :managingEditor, :contributor],
      :urls => [:'link+alternate', :link],
      :description => [:description, :subtitle],
      :ttl => :ttl
    }

    map_functions!(feed_mapping, atomrss, feed)

    # custom channel elements
    feed.id = feed_id(atomrss)
    feed.image = image(atomrss)

    # entry elements
    entry_mapping = {
      :date_published => [:pubDate, :published, :dc_date, :issued],
      :urls => [:'link+alternate', :link],
      :enclosures => :enclosure,
      :description => [:description, :summary],
      :content => [:content, :content_encoded, :description],
      :title => :title,
      :authors => [:author, :contributor, :dc_creator],
      :categories => :category,
      :last_updated => [:updated, :dc_date, :pubDate]
    }

    atomrss.entries.each do |atomrss_entry|
      # some feeds return empty items
      next if atomrss_entry.title.nil? && atomrss_entry.description.nil?

      feed_entry = Entry.new
      map_functions!(entry_mapping, atomrss_entry, feed_entry)

      # custom entry elements
      feed_entry.id = atomrss_entry.guid || atomrss_entry[:id] # entries are a Hash..
      # fall back to link for ID
      feed_entry.id ||= atomrss_entry.link if atomrss_entry.respond_to?(:link) && atomrss_entry.link
      feed_entry.copyright = atomrss_entry.copyright || (atomrss.respond_to?(:copyright) ? atomrss.copyright : nil)

      feed.entries << feed_entry
    end

    feed
  end

  # Returns the feed's image URL, or nil when the feed has neither an image
  # nor a logo.
  def self.image(parser)
    if parser.respond_to?(:image) && parser.image
      if parser.image =~ /<url>/ # RSS image contains an <url> spec
        # Join the captured URLs explicitly: Array#to_s only concatenated the
        # elements on Ruby 1.8 and returns the inspect form on later versions.
        parser.image.scan(/<url>(.*?)<\/url>/).flatten.join
      else
        parser.image # Atom contains just the url
      end
    elsif parser.respond_to?(:logo) && parser.logo
      parser.logo
    end
  end

  # Returns the feed's own id when the parser class defines one, otherwise
  # the feed link as a String (nil when there is no link method either).
  def self.feed_id(parser)
    overridden_value(parser, :id) || ("#{parser.link}" if parser.respond_to?(:link))
  end

  # Gets the value returned from the method if it is overridden, otherwise nil.
  #
  # "Overridden" means the object's own class publicly defines +method+.
  # Names are compared as strings because public_instance_methods returns
  # Strings on Ruby 1.8 and Symbols on later versions.
  def self.overridden_value(object, method)
    own_methods = object.class.public_instance_methods(false).collect { |m| m.to_s }
    object.send(method) if own_methods.include?(method.to_s)
  end

end
142
+ end
data/lib/structures.rb ADDED
@@ -0,0 +1,262 @@
1
+
2
+ module FeedNormalizer
3
+
4
# Lets an including class answer the singular form of its plural,
# array-valued elements. The including class must define ELEMENTS.
module Singular

  # A call to an undefined method +name+ is retried as "#{name}s"; when the
  # object responds to that plural, its first element is returned. Plural
  # methods are assumed to return an array.
  #
  # Example:
  # Object contains an array called 'alphas', which looks like [:a, :b, :c].
  # Call object.alpha and :a is returned.
  def method_missing(name, *args)
    plural = "#{name}s".to_sym

    if respond_to?(plural)
      send(plural).first
    else
      super(name, *args)
    end
  end

  # True for every name in ELEMENTS, for the singular of any name in
  # ELEMENTS, and for whatever the default implementation accepts.
  def respond_to?(x, y=false)
    known = self.class::ELEMENTS
    return true if known.include?(x)
    return true if known.include?("#{x}s".to_sym)

    super(x, y)
  end

end
24
+
25
# Element-wise equality and diffing for classes that define ELEMENTS.
module ElementEquality

  # Same as ==.
  def eql?(other)
    self == other
  end

  # True when +other+ is this very object, or an instance of exactly the
  # same class whose ELEMENTS all compare equal.
  def ==(other)
    return true if other.equal?(self)
    return false unless other.instance_of?(self.class)

    self.class::ELEMENTS.all? { |name| send(name) == other.send(name) }
  end

  # Returns the differences between this object and +other+ as a hash keyed
  # by element name. Elements that +other+ does not respond to, and elements
  # whose values are equal, are left out.
  #
  # A plain value that differs is reported as a pair, the other object's
  # value first:
  #
  #    { :title => [other_content, content] }
  #
  # When the other value itself responds to +diff+, the nested diff is
  # reported instead. When it is a collection whose members all respond to
  # +diff+ (such as a feed's items), an array of per-member diffs is
  # reported; members without differences are omitted:
  #
  #    { :items => [
  #         {:title => ["A new article title", "An article title"]},
  #         {:title => ["a different title", "one title"]} ]}
  #
  # If the two collections differ in size, the sizes are reported instead,
  # this object's first:
  #
  #    { :items => [4,5] }
  #
  # This method can also be useful for human-readable feed comparison if
  # its output is dumped to YAML.
  def diff(other, elements = self.class::ELEMENTS)
    result = {}

    elements.each do |name|
      next unless other.respond_to?(name)

      mine = send(name)
      theirs = other.send(name)
      next if mine == theirs

      result[name] =
        if theirs.respond_to?(:diff)
          mine.diff(theirs)
        elsif theirs.is_a?(Enumerable) && theirs.all? { |v| v.respond_to?(:diff) }
          if mine.size == theirs.size
            per_member = []
            mine.each_with_index do |member, position|
              per_member << member.diff(theirs[position], member.class::ELEMENTS)
            end
            per_member.reject { |h| h.empty? }
          else
            [mine.size, theirs.size]
          end
        else
          [theirs, mine] unless theirs == mine
        end
    end

    result
  end

end
91
+
92
# In-place cleaning for classes that define SIMPLE_ELEMENTS, HTML_ELEMENTS
# and BLENDED_ELEMENTS.
module ElementCleaner
  # Recursively cleans all elements in place.
  #
  # * SIMPLE_ELEMENTS are converted to strings and passed through
  #   HtmlCleaner.flatten (member by member when the value is an Array).
  # * HTML_ELEMENTS are converted to strings and passed through
  #   HtmlCleaner.clean. For feeds these are title and description; for
  #   entries content, description and title.
  # * Every member of a BLENDED_ELEMENTS collection is sent clean! itself.
  def clean!
    owner = self.class

    owner::SIMPLE_ELEMENTS.each do |name|
      current = send(name)

      flattened =
        if current.is_a?(Array)
          current.collect { |member| HtmlCleaner.flatten(member.to_s) }
        else
          HtmlCleaner.flatten(current.to_s)
        end

      send("#{name}=", flattened)
    end

    owner::HTML_ELEMENTS.each do |name|
      cleaned = HtmlCleaner.clean(send(name).to_s)
      send("#{name}=", cleaned)
    end

    owner::BLENDED_ELEMENTS.each do |name|
      send(name).each { |member| member.clean! }
    end
  end
end
119
+
120
module TimeFix
  # Reparse any Time instances, due to RSS::Parser's redefinition of
  # certain aspects of the Time class that creates unexpected behaviour
  # when extending the Time class, as some common third party libraries do.
  # See http://code.google.com/p/feed-normalizer/issues/detail?id=13.
  #
  # obj:: the stored value of a date field.
  #
  # Returns:
  # * for a String, the Time parsed from it (RFC 2822 first, then
  #   Time.parse), or the String itself when neither can parse it;
  # * for a Time, a copy made with Time.at the first time through, and the
  #   value itself once this object has already reparsed something;
  # * anything else (including nil) unchanged. Such values used to be
  #   replaced by nil on the first call but passed through on later ones.
  #
  # Time.rfc2822 and Time.parse come from <tt>require 'time'</tt>; this
  # method assumes it has been loaded elsewhere.
  def reparse(obj)
    # One flag per including object, shared by all of its date fields.
    @parsed ||= false

    case obj
    when String
      begin
        parsed =
          begin
            Time.rfc2822(obj)
          rescue StandardError
            Time.parse(obj)
          end
        @parsed = true
        parsed
      rescue StandardError
        # Not a recognisable date: hand the text back untouched.
        @parsed = false
        obj
      end
    when Time
      return obj if @parsed

      @parsed = true
      begin
        Time.at(obj)
      rescue StandardError
        obj
      end
    else
      obj
    end
  end
end
146
+
147
module RewriteRelativeLinks
  # Turns root-relative href/src attribute values in +text+ into absolute
  # links on the site that +url+ points at.
  #
  # text:: markup to rewrite; converted with to_s when a base can be derived.
  # url::  a URL on the originating site.
  #
  # The base keeps +url+'s scheme (http when it has none) and any
  # non-default port. Protocol-relative links ("//host/path") are already
  # host-qualified and are left alone.
  #
  # Returns the rewritten String, or +text+ unchanged when no host can be
  # derived from +url+.
  def rewrite_relative_links(text, url)
    if base = url_base(url)
      text.to_s.gsub(/(href|src)=('|")\/(?!\/)/) { "#{$1}=#{$2}#{base}/" }
    else
      text
    end
  end

  private

  # Host part of +url+, or nil when it cannot be parsed or has no host.
  def url_host(url)
    URI.parse(url).host
  rescue StandardError
    nil
  end

  # "scheme://host[:port]" for +url+, or nil when it cannot be parsed or has
  # no host. The port is only included when it is not the scheme's default.
  def url_base(url)
    uri = URI.parse(url)
    return nil unless uri.host

    base = "#{uri.scheme || 'http'}://#{uri.host}"
    base += ":#{uri.port}" if uri.port && uri.port != uri.default_port
    base
  rescue StandardError
    nil
  end
end
161
+
162
+
163
# A single item of a feed.
#
# Fields (see ELEMENTS):
#  * content
#  * description
#  * title
#  * date_published
#  * last_updated
#  * urls / url
#  * id
#  * authors / author
#  * copyright
#  * categories
#  * enclosures
#
# Singular names such as +url+ are answered through the Singular mixin.
class Entry
  include Singular, ElementEquality, ElementCleaner, TimeFix, RewriteRelativeLinks

  HTML_ELEMENTS = [:content, :description, :title]
  SIMPLE_ELEMENTS = [:date_published, :urls, :id, :authors, :copyright, :categories, :last_updated, :enclosures]
  BLENDED_ELEMENTS = []

  ELEMENTS = HTML_ELEMENTS + SIMPLE_ELEMENTS + BLENDED_ELEMENTS

  # The three fields with a hand-written reader below only get a generated
  # writer; every other element gets a plain reader and writer.
  attr_writer :date_published, :last_updated, :content
  attr_accessor(*(ELEMENTS - [:date_published, :last_updated, :content]))

  # Starts with empty collections and no dates or content.
  def initialize
    @urls = []
    @authors = []
    @categories = []
    @enclosures = []
    @date_published = nil
    @content = nil
    @last_updated = nil
  end

  # Publication date, passed through TimeFix#reparse; the result is stored
  # back.
  def date_published
    @date_published = reparse(@date_published)
  end

  # Last-updated date, passed through TimeFix#reparse; the result is stored
  # back.
  def last_updated
    @last_updated = reparse(@last_updated)
  end

  # Content with its links rewritten against the entry's first url by
  # RewriteRelativeLinks#rewrite_relative_links; the result is stored back.
  def content
    @content = rewrite_relative_links(@content, url)
  end

end
209
+
210
# The root element of a feed.
#
# Fields (see ELEMENTS):
#  * title
#  * description
#  * id
#  * last_updated
#  * copyright
#  * authors / author
#  * urls / url
#  * image
#  * generator
#  * ttl
#  * skip_hours
#  * skip_days
#  * items (also readable as entries)
#
# +parser+ holds the name of the parser that produced the feed, and
# +channel+ returns the feed itself. Singular names such as +url+ are
# answered through the Singular mixin.
class Feed
  include Singular, ElementEquality, ElementCleaner, TimeFix

  # Elements that can contain HTML fragments.
  HTML_ELEMENTS = [:title, :description]

  # Elements that contain 'plain' Strings, with HTML escaped.
  SIMPLE_ELEMENTS = [:id, :last_updated, :copyright, :authors, :urls, :image, :generator, :ttl, :skip_hours, :skip_days]

  # Elements that contain both HTML and escaped HTML.
  BLENDED_ELEMENTS = [:items]

  ELEMENTS = HTML_ELEMENTS + SIMPLE_ELEMENTS + BLENDED_ELEMENTS

  # last_updated has a hand-written reader below, so it only gets a
  # generated writer; every other element gets a plain reader and writer.
  attr_writer :last_updated
  attr_accessor(*(ELEMENTS - [:last_updated]))
  attr_accessor :parser

  alias_method :entries, :items

  # wrapper:: an object whose +parser+ is recorded, as a String, in +parser+.
  def initialize(wrapper)
    # set up associations (i.e. arrays where needed)
    @urls = []
    @authors = []
    @skip_hours = []
    @skip_days = []
    @items = []
    @parser = wrapper.parser.to_s
    @last_updated = nil
  end

  # Last-updated date, passed through TimeFix#reparse; the result is stored
  # back.
  def last_updated
    @last_updated = reparse(@last_updated)
  end

  # The feed itself.
  def channel
    self
  end

end
260
+
261
+ end
262
+