feed-normalizer 1.5.1 → 1.5.2

This diff shows the changes between two publicly released versions of the package, as published to a supported public registry. It is provided for informational purposes only.
@@ -1,190 +1,181 @@
- require 'rubygems'
- require 'hpricot'
- require 'cgi'
-
- module FeedNormalizer
-
-   # Various methods for cleaning up HTML and preparing it for safe public
-   # consumption.
-   #
-   # Documents used for refrence:
-   # - http://www.w3.org/TR/html4/index/attributes.html
-   # - http://en.wikipedia.org/wiki/List_of_XML_and_HTML_character_entity_references
-   # - http://feedparser.org/docs/html-sanitization.html
-   # - http://code.whytheluckystiff.net/hpricot/wiki
-   class HtmlCleaner
-
-     # allowed html elements.
-     HTML_ELEMENTS = %w(
-       a abbr acronym address area b bdo big blockquote br button caption center
-       cite code col colgroup dd del dfn dir div dl dt em fieldset font h1 h2 h3
-       h4 h5 h6 hr i img ins kbd label legend li map menu ol optgroup p pre q s
-       samp small span strike strong sub sup table tbody td tfoot th thead tr tt
-       u ul var
-     )
-
-     # allowed attributes.
-     HTML_ATTRS = %w(
-       abbr accept accept-charset accesskey align alt axis border cellpadding
-       cellspacing char charoff charset checked cite class clear cols colspan
-       color compact coords datetime dir disabled for frame headers height href
-       hreflang hspace id ismap label lang longdesc maxlength media method
-       multiple name nohref noshade nowrap readonly rel rev rows rowspan rules
-       scope selected shape size span src start summary tabindex target title
-       type usemap valign value vspace width
-     )
-
-     # allowed attributes, but they can contain URIs, extra caution required.
-     # NOTE: That means this doesnt list *all* URI attrs, just the ones that are allowed.
-     HTML_URI_ATTRS = %w(
-       href src cite usemap longdesc
-     )
-
-     DODGY_URI_SCHEMES = %w(
-       javascript vbscript mocha livescript data
-     )
-
-     class << self
-
-       # Does this:
-       # - Unescape HTML
-       # - Parse HTML into tree
-       # - Find 'body' if present, and extract tree inside that tag, otherwise parse whole tree
-       # - Each tag:
-       #   - remove tag if not whitelisted
-       #   - escape HTML tag contents
-       #   - remove all attributes not on whitelist
-       #   - extra-scrub URI attrs; see dodgy_uri?
-       #
-       # Extra (i.e. unmatched) ending tags and comments are removed.
-       def clean(str)
-         str = unescapeHTML(str)
-
-         doc = Hpricot(str, :fixup_tags => true)
-         doc = subtree(doc, :body)
-
-         # get all the tags in the document
-         # Somewhere near hpricot 0.4.92 "*" starting to return all elements,
-         # including text nodes instead of just tagged elements.
-         tags = (doc/"*").inject([]) { |m,e| m << e.name if(e.respond_to?(:name) && e.name =~ /^\w+$/) ; m }.uniq
-
-         # Remove tags that aren't whitelisted.
-         remove_tags!(doc, tags - HTML_ELEMENTS)
-         remaining_tags = tags & HTML_ELEMENTS
-
-         # Remove attributes that aren't on the whitelist, or are suspicious URLs.
-         (doc/remaining_tags.join(",")).each do |element|
-           element.raw_attributes.reject! do |attr,val|
-             !HTML_ATTRS.include?(attr) || (HTML_URI_ATTRS.include?(attr) && dodgy_uri?(val))
-           end
-
-           element.raw_attributes = element.raw_attributes.build_hash {|a,v| [a, add_entities(v)]}
-         end unless remaining_tags.empty?
-
-         doc.traverse_text {|t| t.set(add_entities(t.to_html))}
-
-         # Return the tree, without comments. Ugly way of removing comments,
-         # but can't see a way to do this in Hpricot yet.
-         doc.to_s.gsub(/<\!--.*?-->/mi, '')
-       end
-
-       # For all other feed elements:
-       # - Unescape HTML.
-       # - Parse HTML into tree (taking 'body' as root, if present)
-       # - Takes text out of each tag, and escapes HTML.
-       # - Returns all text concatenated.
-       def flatten(str)
-         str.gsub!("\n", " ")
-         str = unescapeHTML(str)
-
-         doc = Hpricot(str, :xhtml_strict => true)
-         doc = subtree(doc, :body)
-
-         out = []
-         doc.traverse_text {|t| out << add_entities(t.to_html)}
-
-         return out.join
-       end
-
-       # Returns true if the given string contains a suspicious URL,
-       # i.e. a javascript link.
-       #
-       # This method rejects javascript, vbscript, livescript, mocha and data URLs.
-       # It *could* be refined to only deny dangerous data URLs, however.
-       def dodgy_uri?(uri)
-         uri = uri.to_s
-
-         # special case for poorly-formed entities (missing ';')
-         # if these occur *anywhere* within the string, then throw it out.
-         return true if (uri =~ /&\#(\d+|x[0-9a-f]+)[^;\d]/mi)
-
-         # Try escaping as both HTML or URI encodings, and then trying
-         # each scheme regexp on each
-         [unescapeHTML(uri), CGI.unescape(uri)].each do |unesc_uri|
-           DODGY_URI_SCHEMES.each do |scheme|
-
-             regexp = "#{scheme}:".gsub(/./) do |char|
-               "([\000-\037\177\s]*)#{char}"
-             end
-
-             # regexp looks something like
-             # /\A([\000-\037\177\s]*)j([\000-\037\177\s]*)a([\000-\037\177\s]*)v([\000-\037\177\s]*)a([\000-\037\177\s]*)s([\000-\037\177\s]*)c([\000-\037\177\s]*)r([\000-\037\177\s]*)i([\000-\037\177\s]*)p([\000-\037\177\s]*)t([\000-\037\177\s]*):/mi
-             return true if (unesc_uri =~ %r{\A#{regexp}}mi)
-           end
-         end
-
-         nil
-       end
-
-       # unescapes HTML. If xml is true, also converts XML-only named entities to HTML.
-       def unescapeHTML(str, xml = true)
-         CGI.unescapeHTML(xml ? str.gsub("&apos;", "&#39;") : str)
-       end
-
-       # Adds entities where possible.
-       # Works like CGI.escapeHTML, but will not escape existing entities;
-       # i.e. &#123; will NOT become &amp;#123;
-       #
-       # This method could be improved by adding a whitelist of html entities.
-       def add_entities(str)
-         str.to_s.gsub(/\"/n, '&quot;').gsub(/>/n, '&gt;').gsub(/</n, '&lt;').gsub(/&(?!(\#\d+|\#x([0-9a-f]+)|\w{2,8});)/nmi, '&amp;')
-       end
-
-       private
-
-       # Everything below elment, or the just return the doc if element not present.
-       def subtree(doc, element)
-         doc.at("//#{element}/*") || doc
-       end
-
-       def remove_tags!(doc, tags)
-         (doc/tags.join(",")).remove unless tags.empty?
-       end
-
-     end
-   end
- end
-
-
- module Enumerable #:nodoc:
-   def build_hash
-     result = {}
-     self.each do |elt|
-       key, value = yield elt
-       result[key] = value
-     end
-     result
-   end
- end
-
- # http://blade.nagaokaut.ac.jp/cgi-bin/scat.rb/ruby/ruby-talk/207625
- # Subject: A simple Hpricot text setter
- # From: Chris Gehlker <canyonrat mac.com>
- # Date: Fri, 11 Aug 2006 03:19:13 +0900
- class Hpricot::Text #:nodoc:
-   def set(string)
-     @content = string
-     self.raw_string = string
-   end
- end
-
+ require 'rubygems'
+ require 'hpricot'
+ require 'cgi'
+
+ module FeedNormalizer
+
+   # Various methods for cleaning up HTML and preparing it for safe public
+   # consumption.
+   #
+   # Documents used for refrence:
+   # - http://www.w3.org/TR/html4/index/attributes.html
+   # - http://en.wikipedia.org/wiki/List_of_XML_and_HTML_character_entity_references
+   # - http://feedparser.org/docs/html-sanitization.html
+   # - http://code.whytheluckystiff.net/hpricot/wiki
+   class HtmlCleaner
+
+     # allowed html elements.
+     HTML_ELEMENTS = %w(
+       a abbr acronym address area b bdo big blockquote br button caption center
+       cite code col colgroup dd del dfn dir div dl dt em fieldset font h1 h2 h3
+       h4 h5 h6 hr i img ins kbd label legend li map menu ol optgroup p pre q s
+       samp small span strike strong sub sup table tbody td tfoot th thead tr tt
+       u ul var
+     )
+
+     # allowed attributes.
+     HTML_ATTRS = %w(
+       abbr accept accept-charset accesskey align alt axis border cellpadding
+       cellspacing char charoff charset checked cite class clear cols colspan
+       color compact coords datetime dir disabled for frame headers height href
+       hreflang hspace id ismap label lang longdesc maxlength media method
+       multiple name nohref noshade nowrap readonly rel rev rows rowspan rules
+       scope selected shape size span src start summary tabindex target title
+       type usemap valign value vspace width
+     )
+
+     # allowed attributes, but they can contain URIs, extra caution required.
+     # NOTE: That means this doesnt list *all* URI attrs, just the ones that are allowed.
+     HTML_URI_ATTRS = %w(
+       href src cite usemap longdesc
+     )
+
+     DODGY_URI_SCHEMES = %w(
+       javascript vbscript mocha livescript data
+     )
+
+     class << self
+
+       # Does this:
+       # - Unescape HTML
+       # - Parse HTML into tree
+       # - Find 'body' if present, and extract tree inside that tag, otherwise parse whole tree
+       # - Each tag:
+       #   - remove tag if not whitelisted
+       #   - escape HTML tag contents
+       #   - remove all attributes not on whitelist
+       #   - extra-scrub URI attrs; see dodgy_uri?
+       #
+       # Extra (i.e. unmatched) ending tags and comments are removed.
+       def clean(str)
+         str = unescapeHTML(str)
+
+         doc = Hpricot(str, :fixup_tags => true)
+         doc = subtree(doc, :body)
+
+         # get all the tags in the document
+         # Somewhere near hpricot 0.4.92 "*" starting to return all elements,
+         # including text nodes instead of just tagged elements.
+         tags = (doc/"*").inject([]) { |m,e| m << e.name if(e.respond_to?(:name) && e.name =~ /^\w+$/) ; m }.uniq
+
+         # Remove tags that aren't whitelisted.
+         remove_tags!(doc, tags - HTML_ELEMENTS)
+         remaining_tags = tags & HTML_ELEMENTS
+
+         # Remove attributes that aren't on the whitelist, or are suspicious URLs.
+         (doc/remaining_tags.join(",")).each do |element|
+           next if element.raw_attributes.nil? || element.raw_attributes.empty?
+           element.raw_attributes.reject! do |attr,val|
+             !HTML_ATTRS.include?(attr) || (HTML_URI_ATTRS.include?(attr) && dodgy_uri?(val))
+           end
+
+           element.raw_attributes = element.raw_attributes.build_hash {|a,v| [a, add_entities(v)]}
+         end unless remaining_tags.empty?
+
+         doc.traverse_text do |t|
+           t.swap(add_entities(t.to_html))
+         end
+
+         # Return the tree, without comments. Ugly way of removing comments,
+         # but can't see a way to do this in Hpricot yet.
+         doc.to_s.gsub(/<\!--.*?-->/mi, '')
+       end
+
+       # For all other feed elements:
+       # - Unescape HTML.
+       # - Parse HTML into tree (taking 'body' as root, if present)
+       # - Takes text out of each tag, and escapes HTML.
+       # - Returns all text concatenated.
+       def flatten(str)
+         str.gsub!("\n", " ")
+         str = unescapeHTML(str)
+
+         doc = Hpricot(str, :xhtml_strict => true)
+         doc = subtree(doc, :body)
+
+         out = []
+         doc.traverse_text {|t| out << add_entities(t.to_html)}
+
+         return out.join
+       end
+
+       # Returns true if the given string contains a suspicious URL,
+       # i.e. a javascript link.
+       #
+       # This method rejects javascript, vbscript, livescript, mocha and data URLs.
+       # It *could* be refined to only deny dangerous data URLs, however.
+       def dodgy_uri?(uri)
+         uri = uri.to_s
+
+         # special case for poorly-formed entities (missing ';')
+         # if these occur *anywhere* within the string, then throw it out.
+         return true if (uri =~ /&\#(\d+|x[0-9a-f]+)[^;\d]/mi)
+
+         # Try escaping as both HTML or URI encodings, and then trying
+         # each scheme regexp on each
+         [unescapeHTML(uri), CGI.unescape(uri)].each do |unesc_uri|
+           DODGY_URI_SCHEMES.each do |scheme|
+
+             regexp = "#{scheme}:".gsub(/./) do |char|
+               "([\000-\037\177\s]*)#{char}"
+             end
+
+             # regexp looks something like
+             # /\A([\000-\037\177\s]*)j([\000-\037\177\s]*)a([\000-\037\177\s]*)v([\000-\037\177\s]*)a([\000-\037\177\s]*)s([\000-\037\177\s]*)c([\000-\037\177\s]*)r([\000-\037\177\s]*)i([\000-\037\177\s]*)p([\000-\037\177\s]*)t([\000-\037\177\s]*):/mi
+             return true if (unesc_uri =~ %r{\A#{regexp}}mi)
+           end
+         end
+
+         nil
+       end
+
+       # unescapes HTML. If xml is true, also converts XML-only named entities to HTML.
+       def unescapeHTML(str, xml = true)
+         CGI.unescapeHTML(xml ? str.gsub("&apos;", "&#39;") : str)
+       end
+
+       # Adds entities where possible.
+       # Works like CGI.escapeHTML, but will not escape existing entities;
+       # i.e. &#123; will NOT become &amp;#123;
+       #
+       # This method could be improved by adding a whitelist of html entities.
+       def add_entities(str)
+         str.to_s.gsub(/\"/n, '&quot;').gsub(/>/n, '&gt;').gsub(/</n, '&lt;').gsub(/&(?!(\#\d+|\#x([0-9a-f]+)|\w{2,8});)/nmi, '&amp;')
+       end
+
+       private
+
+       # Everything below elment, or the just return the doc if element not present.
+       def subtree(doc, element)
+         doc.at("//#{element}/*") || doc
+       end
+
+       def remove_tags!(doc, tags)
+         (doc/tags.join(",")).remove unless tags.empty?
+       end
+
+     end
+   end
+ end
+
+
+ module Enumerable #:nodoc:
+   def build_hash
+     result = {}
+     self.each do |elt|
+       key, value = yield elt
+       result[key] = value
+     end
+     result
+   end
+ end
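
The file above is the gem's whitelist-based HTML sanitizer; the 1.5.2 side adds a guard for elements with no attributes (the new "next if" line) and replaces the old Hpricot::Text#set monkeypatch, dropped from the end of the file, with Hpricot's own Text#swap. A minimal usage sketch of the public methods, as an illustration only (it assumes the gem and its Hpricot dependency are installed; the result comments describe expected behaviour, not captured output):

  require 'rubygems'
  require 'feed-normalizer'

  html = '<p onclick="alert(1)"><a href="javascript:alert(1)">hi</a><!-- c --></p>'

  # clean: onclick is not in HTML_ATTRS, the href fails dodgy_uri?, and the
  # comment is stripped; the whitelisted p and a tags survive.
  FeedNormalizer::HtmlCleaner.clean(html)

  # flatten: drops the tags entirely, keeping only escaped text content.
  FeedNormalizer::HtmlCleaner.flatten(html)

  # dodgy_uri? also catches scheme-obfuscation tricks, since its generated
  # regexp allows control characters and whitespace between scheme letters.
  FeedNormalizer::HtmlCleaner.dodgy_uri?("java\nscript:alert(1)")  # => true
  FeedNormalizer::HtmlCleaner.dodgy_uri?("http://example.com/")    # => nil
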
@@ -1,98 +1,113 @@
- require 'rss'
-
- # For some reason, this is only included in the RDF Item by default.
- class RSS::Rss::Channel::Item # :nodoc:
-   include RSS::ContentModel
- end
-
- module FeedNormalizer
-   class RubyRssParser < Parser
-
-     def self.parser
-       RSS::Parser
-     end
-
-     def self.parse(xml, loose)
-       begin
-         rss = parser.parse(xml)
-       rescue Exception => e
-         #puts "Parser #{parser} failed because #{e.message.gsub("\n",', ')}"
-         return nil
-       end
-
-       rss ? package(rss, loose) : nil
-     end
-
-     # Fairly high priority; a fast and strict parser.
-     def self.priority
-       100
-     end
-
-     protected
-
-     def self.package(rss, loose)
-       feed = Feed.new(self)
-
-       # channel elements
-       feed_mapping = {
-         :generator => :generator,
-         :title => :title,
-         :urls => :link,
-         :description => :description,
-         :copyright => :copyright,
-         :authors => :managingEditor,
-         :last_updated => [:lastBuildDate, :pubDate, :dc_date],
-         :id => :guid,
-         :ttl => :ttl
-       }
-
-       # make two passes, to catch all possible root elements
-       map_functions!(feed_mapping, rss, feed)
-       map_functions!(feed_mapping, rss.channel, feed)
-
-       # custom channel elements
-       feed.image = rss.image ? rss.image.url : nil
-       feed.skip_hours = skip(rss, :skipHours)
-       feed.skip_days = skip(rss, :skipDays)
-
-       # item elements
-       item_mapping = {
-         :date_published => [:pubDate, :dc_date],
-         :urls => :link,
-         :description => :description,
-         :content => [:content_encoded, :description],
-         :title => :title,
-         :authors => [:author, :dc_creator],
-         :last_updated => [:pubDate, :dc_date] # This is effectively an alias for date_published for this parser.
-       }
-
-       rss.items.each do |rss_item|
-         feed_entry = Entry.new
-         map_functions!(item_mapping, rss_item, feed_entry)
-
-         # custom item elements
-         feed_entry.id = rss_item.guid.content if rss_item.respond_to?(:guid) && rss_item.guid
-         feed_entry.copyright = rss.copyright if rss_item.respond_to? :copyright
-         feed_entry.categories = loose ?
-           rss_item.categories.collect{|c|c.content} :
-           [rss_item.categories.first.content] rescue []
-
-         feed.entries << feed_entry
-       end
-
-       feed
-     end
-
-     def self.skip(parser, attribute)
-       attributes = case attribute
-         when :skipHours: :hours
-         when :skipDays: :days
-       end
+ require 'rss'
+
+ # For some reason, this is only included in the RDF Item by default (in 0.1.6).
+ unless RSS::Rss::Channel::Item.new.respond_to?(:content_encoded)
+   class RSS::Rss::Channel::Item # :nodoc:
+     include RSS::ContentModel
+   end
+ end
+
+ # Add equality onto Enclosures.
+ class RSS::Rss::Channel::Item::Enclosure
+   def eql?(enc)
+     instance_variables.all? do |iv|
+       instance_variable_get(iv) == enc.instance_variable_get(iv)
+     end
+   end
+
+   alias == eql?
+ end
+
+ module FeedNormalizer
+   class RubyRssParser < Parser
+
+     def self.parser
+       RSS::Parser
+     end
+
+     def self.parse(xml, loose)
+       begin
+         rss = parser.parse(xml)
+       rescue Exception => e
+         #puts "Parser #{parser} failed because #{e.message.gsub("\n",', ')}"
+         return nil
+       end
+
+       # check for channel to make sure we're only dealing with RSS.
+       rss && rss.respond_to?(:channel) ? package(rss, loose) : nil
+     end
+
+     # Fairly high priority; a fast and strict parser.
+     def self.priority
+       100
+     end
+
+     protected
+
+     def self.package(rss, loose)
+       feed = Feed.new(self)
+
+       # channel elements
+       feed_mapping = {
+         :generator => :generator,
+         :title => :title,
+         :urls => :link,
+         :description => :description,
+         :copyright => :copyright,
+         :authors => :managingEditor,
+         :last_updated => [:lastBuildDate, :pubDate, :dc_date],
+         :id => :guid,
+         :ttl => :ttl
+       }
+
+       # make two passes, to catch all possible root elements
+       map_functions!(feed_mapping, rss, feed)
+       map_functions!(feed_mapping, rss.channel, feed)
+
+       # custom channel elements
+       feed.image = rss.image ? rss.image.url : nil
+       feed.skip_hours = skip(rss, :skipHours)
+       feed.skip_days = skip(rss, :skipDays)
+
+       # item elements
+       item_mapping = {
+         :date_published => [:pubDate, :dc_date],
+         :urls => :link,
+         :enclosures => :enclosure,
+         :description => :description,
+         :content => [:content_encoded, :description],
+         :title => :title,
+         :authors => [:author, :dc_creator],
+         :last_updated => [:pubDate, :dc_date] # This is effectively an alias for date_published for this parser.
+       }
+
+       rss.items.each do |rss_item|
+         feed_entry = Entry.new
+         map_functions!(item_mapping, rss_item, feed_entry)
+
+         # custom item elements
+         feed_entry.id = rss_item.guid.content if rss_item.respond_to?(:guid) && rss_item.guid
+         feed_entry.copyright = rss.copyright if rss_item.respond_to? :copyright
+         feed_entry.categories = loose ?
+           rss_item.categories.collect{|c|c.content} :
+           [rss_item.categories.first.content] rescue []
+
+         feed.entries << feed_entry
+       end
+
+       feed
+     end
+
+     def self.skip(parser, attribute)
+       case attribute
+         when :skipHours then attributes = :hours
+         when :skipDays then attributes = :days
+       end
        channel = parser.channel

        return nil unless channel.respond_to?(attribute) && a = channel.send(attribute)
-       a.send(attributes).collect{|e| e.content}
-     end
-
-   end
- end
+       a.send(attributes).collect{|e| e.content}
+     end
+
+   end
+ end
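
Two details of this hunk are worth noting. The rewritten skip method drops the Ruby 1.8-only "when X: Y" colon syntax, which no longer parses under Ruby 1.9, in favour of the portable "when X then Y" form. And the new Enclosure patch defines equality by reflecting over instance variables. A standalone sketch of that idiom (the Media class here is a hypothetical stand-in, not part of the gem):

  class Media
    def initialize(url, type, length)
      @url, @type, @length = url, type, length
    end

    # Two objects are equal when all of their instance variables match.
    def eql?(other)
      instance_variables.all? do |iv|
        instance_variable_get(iv) == other.instance_variable_get(iv)
      end
    end

    alias == eql?
  end

  a = Media.new("http://example.com/a.mp3", "audio/mpeg", 42)
  b = Media.new("http://example.com/a.mp3", "audio/mpeg", 42)
  a == b  # => true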