openlogic-feed-normalizer 1.5.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,117 @@
1
+ require 'rss'
2
+
3
# RSS::ContentModel (which provides content_encoded) is only mixed into the
# RDF item class by default (seen with rss 0.1.6), so mix it into the RSS item
# class whenever a fresh item does not answer to content_encoded.
RSS::Rss::Channel::Item.class_eval do # :nodoc:
  include RSS::ContentModel unless new.respond_to?(:content_encoded)
end
9
+
10
# Give enclosures value equality: an enclosure equals another object when
# every instance variable set on the receiver has an equal counterpart on
# that object.
class RSS::Rss::Channel::Item::Enclosure
  def eql?(enc)
    instance_variables.each do |ivar|
      return false unless instance_variable_get(ivar) == enc.instance_variable_get(ivar)
    end
    true
  end

  alias_method :==, :eql?
end
20
+
21
+ module FeedNormalizer
22
+ class RubyRssParser < Parser
23
+
24
# The underlying parser implementation wrapped by this class.
def self.parser() RSS::Parser end
27
+
28
# Parses +xml+ with the wrapped parser and packages the result.
#
# xml::   the raw feed document, handed to parser.parse as-is.
# loose:: forwarded untouched to package.
#
# Returns whatever package returns, or nil when parsing raises, when the
# parser yields nothing, or when the parsed document has no channel
# (i.e. it is not RSS).
def self.parse(xml, loose)
  begin
    rss = parser.parse(xml)
  rescue StandardError
    # A document this parser cannot read is an expected outcome. Only
    # StandardError is swallowed so that interrupts, exit requests and other
    # fatal conditions still reach the caller.
    return nil
  end

  # check for channel to make sure we're only dealing with RSS.
  return nil unless rss && rss.respond_to?(:channel)

  package(rss, loose)
end
39
+
40
# Priority value reported for this parser: 100.
# Fairly high priority; a fast and strict parser.
def self.priority() 100 end
44
+
45
+ protected
46
+
47
# Builds a Feed from a parsed RSS document.
#
# rss::   the document returned by parser.parse; it is read through
#         #channel, #image and #items.
# loose:: when true every category of an item is kept, otherwise only the
#         first one.
#
# Returns the populated Feed. Items whose title and description are both nil
# are skipped.
def self.package(rss, loose)
  feed = Feed.new(self)

  # channel elements
  feed_mapping = {
    :generator => :generator,
    :title => :title,
    :urls => :link,
    :description => :description,
    :copyright => :copyright,
    :authors => :managingEditor,
    :last_updated => [:lastBuildDate, :pubDate, :dc_date],
    :id => :guid,
    :ttl => :ttl
  }

  # make two passes, to catch all possible root elements
  map_functions!(feed_mapping, rss, feed)
  map_functions!(feed_mapping, rss.channel, feed)

  # custom channel elements
  feed.image = rss.image ? rss.image.url : nil
  feed.skip_hours = skip(rss, :skipHours)
  feed.skip_days = skip(rss, :skipDays)

  # item elements
  item_mapping = {
    :date_published => [:pubDate, :dc_date],
    :urls => :link,
    :enclosures => :enclosure,
    :description => :description,
    :content => [:content_encoded, :description],
    :title => :title,
    :authors => [:author, :dc_creator],
    :last_updated => [:pubDate, :dc_date] # This is effectively an alias for date_published for this parser.
  }

  rss.items.each do |rss_item|
    # some feeds return empty items
    next if rss_item.title.nil? && rss_item.description.nil?

    feed_entry = Entry.new
    map_functions!(item_mapping, rss_item, feed_entry)

    # custom item elements
    feed_entry.id = rss_item.guid.content if rss_item.respond_to?(:guid) && rss_item.guid
    # fall back to link for ID
    feed_entry.id ||= rss_item.link if rss_item.respond_to?(:link) && rss_item.link

    # The value is read from the document root, so the root has to be checked
    # as well; guarding only the item raised NoMethodError whenever the item
    # had a copyright accessor and the root did not.
    # NOTE(review): it is unclear whether the channel's copyright was the
    # intended source here -- confirm before widening this.
    if rss_item.respond_to?(:copyright) && rss.respond_to?(:copyright)
      feed_entry.copyright = rss.copyright
    end

    # Any failure while reading categories (e.g. an item without any) falls
    # back to an empty list.
    feed_entry.categories = begin
      if loose
        rss_item.categories.collect { |c| c.content }
      else
        [rss_item.categories.first.content]
      end
    rescue StandardError
      []
    end

    feed.entries << feed_entry
  end

  feed
end
104
+
105
# Collects the contents of the channel's skipHours / skipDays element.
#
# parser::    the parsed document; only its #channel is consulted.
# attribute:: :skipHours or :skipDays.
#
# Returns an array holding the content of each hour/day child, or nil when
# the attribute is not one of the two above, the channel does not respond to
# it, or the channel returns nothing for it.
def self.skip(parser, attribute)
  # Accessor on the skip element that lists its children.
  children = { :skipHours => :hours, :skipDays => :days }[attribute]
  return nil unless children

  channel = parser.channel
  return nil unless channel.respond_to?(attribute)

  element = channel.send(attribute)
  return nil unless element

  element.send(children).collect { |e| e.content }
end
115
+
116
+ end
117
+ end
@@ -0,0 +1,142 @@
1
+ require 'simple-rss'
2
+
3
# Monkey patches for outstanding issues logged in the simple-rss project.
# * Add support for the 'issued' time field:
#   http://rubyforge.org/tracker/index.php?func=detail&aid=13980&group_id=893&atid=3517
# * The '+' symbol is lost when escaping fields:
#   http://rubyforge.org/tracker/index.php?func=detail&aid=10852&group_id=893&atid=3517
#
class SimpleRSS
  @@item_tags << :issued

  # Replaces the library's clean_content.
  # Date-like tags are parsed into Time (falling back to the unescaped text),
  # author-like tags have their markup stripped, and for everything else an
  # empty body is replaced by the href found in the tag's attributes.
  undef clean_content
  def clean_content(tag, attrs, content)
    content = content.to_s
    case tag
    when :pubDate, :lastBuildDate, :published, :updated, :expirationDate, :modified, :'dc:date', :issued
      begin
        Time.parse(content)
      rescue StandardError
        unescape(content)
      end
    when :author, :contributor, :skipHours, :skipDays
      unescape(content.gsub(/<.*?>/,''))
    else
      if content.empty? && "#{attrs} " =~ /href=['"]?([^'"]*)['" ]/mi
        $1.strip
      else
        unescape(content)
      end
    end
  end

  # Replaces the library's unescape.
  undef unescape
  def unescape(s)
    # Raw HTML is inside the CDATA, so just remove the CDATA wrapper.
    return s.gsub(/(<!\[CDATA\[|\]\]>)/,'') if s =~ /^\s*(<!\[CDATA\[|\]\]>)/

    # Already looks like HTML.
    return s if s =~ /[<>]/

    # Make it HTML.
    FeedNormalizer::HtmlCleaner.unescapeHTML(s)
  end
end
39
+
40
+ module FeedNormalizer
41
+
42
+ # The SimpleRSS parser can handle both RSS and Atom feeds.
43
+ class SimpleRssParser < Parser
44
+
45
# The underlying parser implementation wrapped by this class.
def self.parser() SimpleRSS end
48
+
49
# Parses +xml+ with the wrapped parser and packages the result.
#
# xml::   the raw feed document, handed to parser.parse as-is.
# loose:: accepted for interface parity with the other parsers; not used here.
#
# Returns whatever package returns, or nil when parsing raises.
def self.parse(xml, loose)
  begin
    atomrss = parser.parse(xml)
  rescue StandardError
    # A document this parser cannot read is an expected outcome. Only
    # StandardError is swallowed so that interrupts, exit requests and other
    # fatal conditions still reach the caller.
    return nil
  end

  package(atomrss)
end
59
+
60
# Priority value reported for this parser: 900.
# Fairly low priority; a slower, liberal parser.
def self.priority() 900 end
64
+
65
+ protected
66
+
67
# Builds a Feed from a document parsed by the wrapped parser.
#
# atomrss:: the parsed document; it is read through #entries and, when
#           available, #copyright, and is passed to feed_id and image.
#
# Returns the populated Feed. Entries whose title and description are both
# nil are skipped.
def self.package(atomrss)
  feed = Feed.new(self)

  # root elements
  root_fields = {
    :generator => :generator,
    :title => :title,
    :last_updated => [:updated, :lastBuildDate, :pubDate, :dc_date],
    :copyright => [:copyright, :rights],
    :authors => [:author, :webMaster, :managingEditor, :contributor],
    :urls => [:'link+alternate', :link],
    :description => [:description, :subtitle],
    :ttl => :ttl
  }
  map_functions!(root_fields, atomrss, feed)

  # custom channel elements
  feed.id = feed_id(atomrss)
  feed.image = image(atomrss)

  # entry elements
  entry_fields = {
    :date_published => [:pubDate, :published, :dc_date, :issued],
    :urls => [:'link+alternate', :link],
    :enclosures => :enclosure,
    :description => [:description, :summary],
    :content => [:content, :content_encoded, :description],
    :title => :title,
    :authors => [:author, :contributor, :dc_creator],
    :categories => :category,
    :last_updated => [:updated, :dc_date, :pubDate]
  }

  atomrss.entries.each do |source_entry|
    # some feeds return empty items
    next if source_entry.title.nil? && source_entry.description.nil?

    normalized = Entry.new
    map_functions!(entry_fields, source_entry, normalized)

    # custom entry elements
    normalized.id = source_entry.guid || source_entry[:id] # entries are a Hash..
    # fall back to link for ID
    normalized.id ||= source_entry.link if source_entry.respond_to?(:link) && source_entry.link

    # The entry's own copyright wins; otherwise use the document's, if it has one.
    own_copyright = source_entry.copyright
    normalized.copyright =
      if own_copyright
        own_copyright
      elsif atomrss.respond_to?(:copyright)
        atomrss.copyright
      else
        nil
      end

    feed.entries << normalized
  end

  feed
end
119
+
120
# Extracts the feed image URL from the parsed document.
#
# parser:: the parsed document; #image and #logo are used when it responds
#          to them.
#
# Returns the text of the <url> element(s) when the image value contains
# one, the image value itself otherwise, the logo as a fallback, or nil when
# neither is available.
def self.image(parser)
  if parser.respond_to?(:image) && parser.image
    if parser.image =~ /<url>/ # RSS image contains an <url> spec
      # Array#to_s only concatenated its elements on Ruby 1.8; later versions
      # return the inspect form (e.g. '[["http://..."]]'), so join explicitly.
      parser.image.scan(/<url>(.*?)<\/url>/).flatten.join
    else
      parser.image # Atom contains just the url
    end
  elsif parser.respond_to?(:logo) && parser.logo
    parser.logo
  end
end
131
+
132
# Picks the feed's id: whatever overridden_value reports for :id when that is
# truthy, otherwise the document's link rendered as a string (nil when the
# document has no link accessor).
def self.feed_id(parser)
  explicit = overridden_value(parser, :id)
  return explicit if explicit
  return nil unless parser.respond_to?(:link)

  "#{parser.link}"
end
135
+
136
# Gets the value returned from the method if it is overridden, otherwise nil.
#
# object:: the object to query.
# method:: the method name, as a Symbol or String.
#
# "Overridden" means the method is a public instance method defined directly
# on the object's class (inherited methods do not count). Previously the
# result of the include? check itself (true/false) was returned instead of
# the method's value.
def self.overridden_value(object, method)
  # Method names are Strings on Ruby 1.8 and Symbols afterwards; compare as Symbols.
  own_methods = object.class.public_instance_methods(false).map { |m| m.to_sym }
  own_methods.include?(method.to_sym) ? object.send(method) : nil
end
140
+
141
+ end
142
+ end
data/lib/structures.rb ADDED
@@ -0,0 +1,262 @@
1
+
2
+ module FeedNormalizer
3
+
4
module Singular

  # Singular accessors for plural elements.
  #
  # When an unknown method is called, an 's' is appended to its name; if the
  # object responds to that plural name, the plural method is called and the
  # first element of its result is returned (plural methods are assumed to
  # return an array). Anything else goes through the normal method_missing.
  #
  # Example:
  # Object contains an array called 'alphas', which looks like [:a, :b, :c].
  # Call object.alpha and :a is returned.
  def method_missing(name, *args)
    plural = "#{name}s".to_sym
    if respond_to?(plural)
      send(plural).first
    else
      super(name, *args)
    end
  end

  # True when x, or x with an 's' appended, is listed in the including
  # class's ELEMENTS; otherwise defers to the default implementation.
  def respond_to?(x, y=false)
    known = self.class::ELEMENTS
    return true if known.include?(x)
    return true if known.include?("#{x}s".to_sym)

    super(x, y)
  end

end
24
+
25
module ElementEquality

  # Same as ==.
  def eql?(other)
    self == other
  end

  # Two objects are equal when they are the same object, or when other is an
  # instance of exactly the same class and every element listed in ELEMENTS
  # compares equal.
  def ==(other)
    return true if other.equal?(self)
    return false unless other.instance_of?(self.class)

    self.class::ELEMENTS.all? { |name| send(name) == other.send(name) }
  end

  # Returns the differences between this object and other as a hash keyed by
  # element name. Elements that other does not respond to, and elements that
  # compare equal, are left out.
  #
  # A plain element that differs is reported as a pair holding other's value
  # followed by this object's value:
  #
  #    { :title => [other_content, content] }
  #
  # When the element's value itself responds to diff, that nested diff is
  # reported instead.
  #
  # For a collection whose members all respond to diff (such as the items of
  # a feed), an array of per-member diffs is reported, containing only the
  # members that differ:
  #
  #    { :items => [
  #         {:title => ["A new article title", "An article title"]},
  #         {:title => ["a different title", "one title"]} ]}
  #
  # If the two collections differ in size, the sizes are reported instead
  # (this object's first, then other's):
  #
  #    { :items => [4,5] }
  #
  # This method can also be useful for human-readable feed comparison if
  # its output is dumped to YAML.
  def diff(other, elements = self.class::ELEMENTS)
    diffs = {}

    elements.each do |element|
      next unless other.respond_to?(element)

      mine = send(element)
      theirs = other.send(element)
      next if mine == theirs

      diffs[element] =
        if theirs.respond_to?(:diff)
          mine.diff(theirs)
        elsif theirs.is_a?(Enumerable) && theirs.all? { |member| member.respond_to?(:diff) }
          if mine.size != theirs.size
            [mine.size, theirs.size]
          else
            member_diffs = mine.each_with_index.map do |member, position|
              member.diff(theirs[position], member.class::ELEMENTS)
            end
            member_diffs.reject { |found| found.empty? }
          end
        else
          [theirs, mine] unless theirs == mine
        end
    end

    diffs
  end

end
91
+
92
module ElementCleaner
  # Recursively cleans all elements in place, driven by three constants on
  # the including class:
  #
  # - SIMPLE_ELEMENTS are stringified and passed through HtmlCleaner.flatten
  #   (array values are flattened member by member).
  # - HTML_ELEMENTS are stringified and passed through HtmlCleaner.clean.
  # - BLENDED_ELEMENTS are collections whose members are each told to clean!
  #   themselves.
  #
  # For feed elements that can contain HTML:
  # - feed.(title|description)
  # - feed.entries[n].(title|description|content)
  #
  def clean!
    self.class::SIMPLE_ELEMENTS.each do |element|
      current = send(element)
      flattened =
        if current.is_a?(Array)
          current.collect { |member| HtmlCleaner.flatten(member.to_s) }
        else
          HtmlCleaner.flatten(current.to_s)
        end
      send("#{element}=", flattened)
    end

    self.class::HTML_ELEMENTS.each do |element|
      cleaned = HtmlCleaner.clean(send(element).to_s)
      send("#{element}=", cleaned)
    end

    self.class::BLENDED_ELEMENTS.each do |element|
      send(element).each { |member| member.clean! }
    end
  end
end
119
+
120
module TimeFix
  # Reparse any Time instances, due to RSS::Parser's redefinition of
  # certain aspects of the Time class that creates unexpected behaviour
  # when extending the Time class, as some common third party libraries do.
  # See http://code.google.com/p/feed-normalizer/issues/detail?id=13.
  #
  # Strings are converted by trying Time.at, Time.rfc2822 and Time.parse in
  # that order; a string none of them accepts is returned unchanged.
  #
  # A single @parsed flag on the including object records whether the last
  # conversion succeeded. While it is set, non-string values are returned
  # untouched; while it is clear, a Time is rebuilt through Time.at and any
  # other non-string value yields nil.
  def reparse(obj)
    @parsed ||= false

    unless obj.is_a?(String)
      return obj if @parsed
      return nil unless obj.is_a?(Time)

      @parsed = true
      return (Time.at(obj) rescue obj)
    end

    @parsed = true
    [:at, :rfc2822, :parse].each do |reader|
      begin
        return Time.send(reader, obj)
      rescue StandardError
        next
      end
    end

    # None of the readers accepted the string.
    @parsed = false
    obj
  end
end
146
+
147
module RewriteRelativeLinks
  # Rewrites root-relative href/src attributes in +text+ into absolute URLs.
  #
  # text:: the markup to rewrite; it is converted with to_s before rewriting.
  # url::  the URL the links are relative to.
  #
  # Returns the rewritten string, or +text+ itself (unconverted) when no host
  # can be extracted from +url+.
  #
  # The absolute prefix keeps the http/https scheme and any non-default port
  # of +url+ (any other or missing scheme falls back to http).
  # Protocol-relative links ("//host/path") are already absolute and are left
  # untouched.
  def rewrite_relative_links(text, url)
    base = url_base(url)
    return text unless base

    # (?!\/) keeps "//host/..." links from being treated as root-relative.
    text.to_s.gsub(/(href|src)=('|")\/(?!\/)/) { "#{$1}=#{$2}#{base}/" }
  end

  private

  # Host part of +url+, or nil when it cannot be parsed.
  def url_host(url)
    URI.parse(url).host rescue nil
  end

  # "scheme://host[:port]" for +url+, or nil when it cannot be parsed or has
  # no host.
  def url_base(url)
    uri = URI.parse(url)
    return nil unless uri.host

    scheme = uri.scheme.to_s.downcase
    scheme = 'http' unless scheme == 'http' || scheme == 'https'
    port = uri.port && uri.port != uri.default_port ? ":#{uri.port}" : ''

    "#{scheme}://#{uri.host}#{port}"
  rescue StandardError
    nil
  end
end
161
+
162
+
163
+ # Represents a feed item entry.
164
+ # Available fields are:
165
+ # * content
166
+ # * description
167
+ # * title
168
+ # * date_published
169
+ # * urls / url
170
+ # * id
171
+ # * authors / author
172
+ # * copyright
173
+ # * categories
174
class Entry
  include Singular, ElementEquality, ElementCleaner, TimeFix, RewriteRelativeLinks

  # Passed through HtmlCleaner.clean by ElementCleaner#clean!.
  HTML_ELEMENTS = [:content, :description, :title]

  # Passed through HtmlCleaner.flatten by ElementCleaner#clean!.
  SIMPLE_ELEMENTS = [
    :date_published, :urls, :id, :authors, :copyright,
    :categories, :last_updated, :enclosures
  ]

  # Entries have no nested elements to clean recursively.
  BLENDED_ELEMENTS = []

  ELEMENTS = HTML_ELEMENTS + SIMPLE_ELEMENTS + BLENDED_ELEMENTS

  attr_accessor(*ELEMENTS)

  # Starts with empty collections and no dates or content.
  def initialize
    @urls = []
    @authors = []
    @categories = []
    @enclosures = []
    @date_published = nil
    @content = nil
    @last_updated = nil
  end

  # The generated readers below are replaced so that each read passes the
  # stored value through reparse / rewrite_relative_links and stores the
  # result back.

  undef date_published
  def date_published
    @date_published = reparse(@date_published)
  end

  undef last_updated
  def last_updated
    @last_updated = reparse(@last_updated)
  end

  undef content
  def content
    # url is the singular form of urls, i.e. the entry's first URL.
    @content = rewrite_relative_links(@content, url)
  end

end
209
+
210
+ # Represents the root element of a feed.
211
+ # Available fields are:
212
+ # * title
213
+ # * description
214
+ # * id
215
+ # * last_updated
216
+ # * copyright
217
+ # * authors / author
218
+ # * urls / url
219
+ # * image
220
+ # * generator
221
+ # * items / channel
222
class Feed
  include Singular, ElementEquality, ElementCleaner, TimeFix

  # Elements that can contain HTML fragments.
  HTML_ELEMENTS = [:title, :description]

  # Elements that contain 'plain' Strings, with HTML escaped.
  SIMPLE_ELEMENTS = [
    :id, :last_updated, :copyright, :authors, :urls,
    :image, :generator, :ttl, :skip_hours, :skip_days
  ]

  # Elements that contain both HTML and escaped HTML.
  BLENDED_ELEMENTS = [:items]

  ELEMENTS = HTML_ELEMENTS + SIMPLE_ELEMENTS + BLENDED_ELEMENTS

  attr_accessor(*ELEMENTS)
  attr_accessor(:parser)

  # entries is another name for the items reader.
  alias_method :entries, :items

  # wrapper:: the parser wrapper that built this feed; the string form of
  #           its parser is recorded in #parser.
  def initialize(wrapper)
    # set up associations (i.e. arrays where needed)
    @urls = []
    @authors = []
    @skip_hours = []
    @skip_days = []
    @items = []
    @parser = wrapper.parser.to_s
    @last_updated = nil
  end

  # Replaces the generated reader: each read passes the stored value through
  # reparse and stores the result back.
  undef last_updated
  def last_updated
    @last_updated = reparse(@last_updated)
  end

  # The feed acts as its own channel.
  def channel
    self
  end

end
260
+
261
+ end
262
+