feed-normalizer 1.5.1 → 1.5.2

@@ -1,190 +1,181 @@
- require 'rubygems'
- require 'hpricot'
- require 'cgi'
-
- module FeedNormalizer
-
- # Various methods for cleaning up HTML and preparing it for safe public
- # consumption.
- #
- # Documents used for refrence:
- # - http://www.w3.org/TR/html4/index/attributes.html
- # - http://en.wikipedia.org/wiki/List_of_XML_and_HTML_character_entity_references
- # - http://feedparser.org/docs/html-sanitization.html
- # - http://code.whytheluckystiff.net/hpricot/wiki
- class HtmlCleaner
-
- # allowed html elements.
- HTML_ELEMENTS = %w(
- a abbr acronym address area b bdo big blockquote br button caption center
- cite code col colgroup dd del dfn dir div dl dt em fieldset font h1 h2 h3
- h4 h5 h6 hr i img ins kbd label legend li map menu ol optgroup p pre q s
- samp small span strike strong sub sup table tbody td tfoot th thead tr tt
- u ul var
- )
-
- # allowed attributes.
- HTML_ATTRS = %w(
- abbr accept accept-charset accesskey align alt axis border cellpadding
- cellspacing char charoff charset checked cite class clear cols colspan
- color compact coords datetime dir disabled for frame headers height href
- hreflang hspace id ismap label lang longdesc maxlength media method
- multiple name nohref noshade nowrap readonly rel rev rows rowspan rules
- scope selected shape size span src start summary tabindex target title
- type usemap valign value vspace width
- )
-
- # allowed attributes, but they can contain URIs, extra caution required.
- # NOTE: That means this doesnt list *all* URI attrs, just the ones that are allowed.
- HTML_URI_ATTRS = %w(
- href src cite usemap longdesc
- )
-
- DODGY_URI_SCHEMES = %w(
- javascript vbscript mocha livescript data
- )
-
- class << self
-
- # Does this:
- # - Unescape HTML
- # - Parse HTML into tree
- # - Find 'body' if present, and extract tree inside that tag, otherwise parse whole tree
- # - Each tag:
- # - remove tag if not whitelisted
- # - escape HTML tag contents
- # - remove all attributes not on whitelist
- # - extra-scrub URI attrs; see dodgy_uri?
- #
- # Extra (i.e. unmatched) ending tags and comments are removed.
- def clean(str)
- str = unescapeHTML(str)
-
- doc = Hpricot(str, :fixup_tags => true)
- doc = subtree(doc, :body)
-
- # get all the tags in the document
- # Somewhere near hpricot 0.4.92 "*" starting to return all elements,
- # including text nodes instead of just tagged elements.
- tags = (doc/"*").inject([]) { |m,e| m << e.name if(e.respond_to?(:name) && e.name =~ /^\w+$/) ; m }.uniq
-
- # Remove tags that aren't whitelisted.
- remove_tags!(doc, tags - HTML_ELEMENTS)
- remaining_tags = tags & HTML_ELEMENTS
-
- # Remove attributes that aren't on the whitelist, or are suspicious URLs.
- (doc/remaining_tags.join(",")).each do |element|
- element.raw_attributes.reject! do |attr,val|
- !HTML_ATTRS.include?(attr) || (HTML_URI_ATTRS.include?(attr) && dodgy_uri?(val))
- end
-
- element.raw_attributes = element.raw_attributes.build_hash {|a,v| [a, add_entities(v)]}
- end unless remaining_tags.empty?
-
- doc.traverse_text {|t| t.set(add_entities(t.to_html))}
-
- # Return the tree, without comments. Ugly way of removing comments,
- # but can't see a way to do this in Hpricot yet.
- doc.to_s.gsub(/<\!--.*?-->/mi, '')
- end
-
- # For all other feed elements:
- # - Unescape HTML.
- # - Parse HTML into tree (taking 'body' as root, if present)
- # - Takes text out of each tag, and escapes HTML.
- # - Returns all text concatenated.
- def flatten(str)
- str.gsub!("\n", " ")
- str = unescapeHTML(str)
-
- doc = Hpricot(str, :xhtml_strict => true)
- doc = subtree(doc, :body)
-
- out = []
- doc.traverse_text {|t| out << add_entities(t.to_html)}
-
- return out.join
- end
-
- # Returns true if the given string contains a suspicious URL,
- # i.e. a javascript link.
- #
- # This method rejects javascript, vbscript, livescript, mocha and data URLs.
- # It *could* be refined to only deny dangerous data URLs, however.
- def dodgy_uri?(uri)
- uri = uri.to_s
-
- # special case for poorly-formed entities (missing ';')
- # if these occur *anywhere* within the string, then throw it out.
- return true if (uri =~ /&\#(\d+|x[0-9a-f]+)[^;\d]/mi)
-
- # Try escaping as both HTML or URI encodings, and then trying
- # each scheme regexp on each
- [unescapeHTML(uri), CGI.unescape(uri)].each do |unesc_uri|
- DODGY_URI_SCHEMES.each do |scheme|
-
- regexp = "#{scheme}:".gsub(/./) do |char|
- "([\000-\037\177\s]*)#{char}"
- end
-
- # regexp looks something like
- # /\A([\000-\037\177\s]*)j([\000-\037\177\s]*)a([\000-\037\177\s]*)v([\000-\037\177\s]*)a([\000-\037\177\s]*)s([\000-\037\177\s]*)c([\000-\037\177\s]*)r([\000-\037\177\s]*)i([\000-\037\177\s]*)p([\000-\037\177\s]*)t([\000-\037\177\s]*):/mi
- return true if (unesc_uri =~ %r{\A#{regexp}}mi)
- end
- end
-
- nil
- end
-
- # unescapes HTML. If xml is true, also converts XML-only named entities to HTML.
- def unescapeHTML(str, xml = true)
- CGI.unescapeHTML(xml ? str.gsub("&apos;", "&#39;") : str)
- end
-
- # Adds entities where possible.
- # Works like CGI.escapeHTML, but will not escape existing entities;
- # i.e. &#123; will NOT become &amp;#123;
- #
- # This method could be improved by adding a whitelist of html entities.
- def add_entities(str)
- str.to_s.gsub(/\"/n, '&quot;').gsub(/>/n, '&gt;').gsub(/</n, '&lt;').gsub(/&(?!(\#\d+|\#x([0-9a-f]+)|\w{2,8});)/nmi, '&amp;')
- end
-
- private
-
- # Everything below elment, or the just return the doc if element not present.
- def subtree(doc, element)
- doc.at("//#{element}/*") || doc
- end
-
- def remove_tags!(doc, tags)
- (doc/tags.join(",")).remove unless tags.empty?
- end
-
- end
- end
- end
-
-
- module Enumerable #:nodoc:
- def build_hash
- result = {}
- self.each do |elt|
- key, value = yield elt
- result[key] = value
- end
- result
- end
- end
-
- # http://blade.nagaokaut.ac.jp/cgi-bin/scat.rb/ruby/ruby-talk/207625
- # Subject: A simple Hpricot text setter
- # From: Chris Gehlker <canyonrat mac.com>
- # Date: Fri, 11 Aug 2006 03:19:13 +0900
- class Hpricot::Text #:nodoc:
- def set(string)
- @content = string
- self.raw_string = string
- end
- end
-
+ require 'rubygems'
+ require 'hpricot'
+ require 'cgi'
+
+ module FeedNormalizer
+
+ # Various methods for cleaning up HTML and preparing it for safe public
+ # consumption.
+ #
+ # Documents used for refrence:
+ # - http://www.w3.org/TR/html4/index/attributes.html
+ # - http://en.wikipedia.org/wiki/List_of_XML_and_HTML_character_entity_references
+ # - http://feedparser.org/docs/html-sanitization.html
+ # - http://code.whytheluckystiff.net/hpricot/wiki
+ class HtmlCleaner
+
+ # allowed html elements.
+ HTML_ELEMENTS = %w(
+ a abbr acronym address area b bdo big blockquote br button caption center
+ cite code col colgroup dd del dfn dir div dl dt em fieldset font h1 h2 h3
+ h4 h5 h6 hr i img ins kbd label legend li map menu ol optgroup p pre q s
+ samp small span strike strong sub sup table tbody td tfoot th thead tr tt
+ u ul var
+ )
+
+ # allowed attributes.
+ HTML_ATTRS = %w(
+ abbr accept accept-charset accesskey align alt axis border cellpadding
+ cellspacing char charoff charset checked cite class clear cols colspan
+ color compact coords datetime dir disabled for frame headers height href
+ hreflang hspace id ismap label lang longdesc maxlength media method
+ multiple name nohref noshade nowrap readonly rel rev rows rowspan rules
+ scope selected shape size span src start summary tabindex target title
+ type usemap valign value vspace width
+ )
+
+ # allowed attributes, but they can contain URIs, extra caution required.
+ # NOTE: That means this doesnt list *all* URI attrs, just the ones that are allowed.
+ HTML_URI_ATTRS = %w(
+ href src cite usemap longdesc
+ )
+
+ DODGY_URI_SCHEMES = %w(
+ javascript vbscript mocha livescript data
+ )
+
+ class << self
+
+ # Does this:
+ # - Unescape HTML
+ # - Parse HTML into tree
+ # - Find 'body' if present, and extract tree inside that tag, otherwise parse whole tree
+ # - Each tag:
+ # - remove tag if not whitelisted
+ # - escape HTML tag contents
+ # - remove all attributes not on whitelist
+ # - extra-scrub URI attrs; see dodgy_uri?
+ #
+ # Extra (i.e. unmatched) ending tags and comments are removed.
+ def clean(str)
+ str = unescapeHTML(str)
+
+ doc = Hpricot(str, :fixup_tags => true)
+ doc = subtree(doc, :body)
+
+ # get all the tags in the document
+ # Somewhere near hpricot 0.4.92 "*" starting to return all elements,
+ # including text nodes instead of just tagged elements.
+ tags = (doc/"*").inject([]) { |m,e| m << e.name if(e.respond_to?(:name) && e.name =~ /^\w+$/) ; m }.uniq
+
+ # Remove tags that aren't whitelisted.
+ remove_tags!(doc, tags - HTML_ELEMENTS)
+ remaining_tags = tags & HTML_ELEMENTS
+
+ # Remove attributes that aren't on the whitelist, or are suspicious URLs.
+ (doc/remaining_tags.join(",")).each do |element|
+ next if element.raw_attributes.nil? || element.raw_attributes.empty?
+ element.raw_attributes.reject! do |attr,val|
+ !HTML_ATTRS.include?(attr) || (HTML_URI_ATTRS.include?(attr) && dodgy_uri?(val))
+ end
+
+ element.raw_attributes = element.raw_attributes.build_hash {|a,v| [a, add_entities(v)]}
+ end unless remaining_tags.empty?
+
+ doc.traverse_text do |t|
+ t.swap(add_entities(t.to_html))
+ end
+
+ # Return the tree, without comments. Ugly way of removing comments,
+ # but can't see a way to do this in Hpricot yet.
+ doc.to_s.gsub(/<\!--.*?-->/mi, '')
+ end
+
+ # For all other feed elements:
+ # - Unescape HTML.
+ # - Parse HTML into tree (taking 'body' as root, if present)
+ # - Takes text out of each tag, and escapes HTML.
+ # - Returns all text concatenated.
+ def flatten(str)
+ str.gsub!("\n", " ")
+ str = unescapeHTML(str)
+
+ doc = Hpricot(str, :xhtml_strict => true)
+ doc = subtree(doc, :body)
+
+ out = []
+ doc.traverse_text {|t| out << add_entities(t.to_html)}
+
+ return out.join
+ end
+
+ # Returns true if the given string contains a suspicious URL,
+ # i.e. a javascript link.
+ #
+ # This method rejects javascript, vbscript, livescript, mocha and data URLs.
+ # It *could* be refined to only deny dangerous data URLs, however.
+ def dodgy_uri?(uri)
+ uri = uri.to_s
+
+ # special case for poorly-formed entities (missing ';')
+ # if these occur *anywhere* within the string, then throw it out.
+ return true if (uri =~ /&\#(\d+|x[0-9a-f]+)[^;\d]/mi)
+
+ # Try escaping as both HTML or URI encodings, and then trying
+ # each scheme regexp on each
+ [unescapeHTML(uri), CGI.unescape(uri)].each do |unesc_uri|
+ DODGY_URI_SCHEMES.each do |scheme|
+
+ regexp = "#{scheme}:".gsub(/./) do |char|
+ "([\000-\037\177\s]*)#{char}"
+ end
+
+ # regexp looks something like
+ # /\A([\000-\037\177\s]*)j([\000-\037\177\s]*)a([\000-\037\177\s]*)v([\000-\037\177\s]*)a([\000-\037\177\s]*)s([\000-\037\177\s]*)c([\000-\037\177\s]*)r([\000-\037\177\s]*)i([\000-\037\177\s]*)p([\000-\037\177\s]*)t([\000-\037\177\s]*):/mi
+ return true if (unesc_uri =~ %r{\A#{regexp}}mi)
+ end
+ end
+
+ nil
+ end
+
+ # unescapes HTML. If xml is true, also converts XML-only named entities to HTML.
+ def unescapeHTML(str, xml = true)
+ CGI.unescapeHTML(xml ? str.gsub("&apos;", "&#39;") : str)
+ end
+
+ # Adds entities where possible.
+ # Works like CGI.escapeHTML, but will not escape existing entities;
+ # i.e. &#123; will NOT become &amp;#123;
+ #
+ # This method could be improved by adding a whitelist of html entities.
+ def add_entities(str)
+ str.to_s.gsub(/\"/n, '&quot;').gsub(/>/n, '&gt;').gsub(/</n, '&lt;').gsub(/&(?!(\#\d+|\#x([0-9a-f]+)|\w{2,8});)/nmi, '&amp;')
+ end
+
+ private
+
+ # Everything below elment, or the just return the doc if element not present.
+ def subtree(doc, element)
+ doc.at("//#{element}/*") || doc
+ end
+
+ def remove_tags!(doc, tags)
+ (doc/tags.join(",")).remove unless tags.empty?
+ end
+
+ end
+ end
+ end
+
+
+ module Enumerable #:nodoc:
+ def build_hash
+ result = {}
+ self.each do |elt|
+ key, value = yield elt
+ result[key] = value
+ end
+ result
+ end
+ end
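
In this first hunk (the HtmlCleaner file), 1.5.2 makes two behavioral changes: clean now skips elements whose raw_attributes hash is nil or empty before scrubbing, and re-entitied text nodes are written back via Hpricot's own Text#swap rather than the custom Hpricot::Text#set monkeypatch, which is why that patch disappears from the bottom of the file. A rough usage sketch of the class's public entry points follows; the input strings are invented for illustration, and the exact output depends on Hpricot's parsing:

    require 'feed-normalizer'

    # Non-whitelisted tags are removed outright; non-whitelisted
    # attributes are dropped from tags that survive.
    FeedNormalizer::HtmlCleaner.clean('<p onclick="evil()">Hi <script>alert(1)</script></p>')
    # => something like "<p>Hi </p>"

    # dodgy_uri? catches scheme obfuscation via control characters,
    # whitespace, and malformed entities.
    FeedNormalizer::HtmlCleaner.dodgy_uri?("java\nscript:alert(1)")  # => true
    FeedNormalizer::HtmlCleaner.dodgy_uri?("http://example.com/")    # => nil (falsy)

    # flatten returns only the escaped text content.
    FeedNormalizer::HtmlCleaner.flatten("<b>this</b> & that")        # => "this &amp; that"
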
@@ -1,98 +1,113 @@
- require 'rss'
-
- # For some reason, this is only included in the RDF Item by default.
- class RSS::Rss::Channel::Item # :nodoc:
- include RSS::ContentModel
- end
-
- module FeedNormalizer
- class RubyRssParser < Parser
-
- def self.parser
- RSS::Parser
- end
-
- def self.parse(xml, loose)
- begin
- rss = parser.parse(xml)
- rescue Exception => e
- #puts "Parser #{parser} failed because #{e.message.gsub("\n",', ')}"
- return nil
- end
-
- rss ? package(rss, loose) : nil
- end
-
- # Fairly high priority; a fast and strict parser.
- def self.priority
- 100
- end
-
- protected
-
- def self.package(rss, loose)
- feed = Feed.new(self)
-
- # channel elements
- feed_mapping = {
- :generator => :generator,
- :title => :title,
- :urls => :link,
- :description => :description,
- :copyright => :copyright,
- :authors => :managingEditor,
- :last_updated => [:lastBuildDate, :pubDate, :dc_date],
- :id => :guid,
- :ttl => :ttl
- }
-
- # make two passes, to catch all possible root elements
- map_functions!(feed_mapping, rss, feed)
- map_functions!(feed_mapping, rss.channel, feed)
-
- # custom channel elements
- feed.image = rss.image ? rss.image.url : nil
- feed.skip_hours = skip(rss, :skipHours)
- feed.skip_days = skip(rss, :skipDays)
-
- # item elements
- item_mapping = {
- :date_published => [:pubDate, :dc_date],
- :urls => :link,
- :description => :description,
- :content => [:content_encoded, :description],
- :title => :title,
- :authors => [:author, :dc_creator],
- :last_updated => [:pubDate, :dc_date] # This is effectively an alias for date_published for this parser.
- }
-
- rss.items.each do |rss_item|
- feed_entry = Entry.new
- map_functions!(item_mapping, rss_item, feed_entry)
-
- # custom item elements
- feed_entry.id = rss_item.guid.content if rss_item.respond_to?(:guid) && rss_item.guid
- feed_entry.copyright = rss.copyright if rss_item.respond_to? :copyright
- feed_entry.categories = loose ?
- rss_item.categories.collect{|c|c.content} :
- [rss_item.categories.first.content] rescue []
-
- feed.entries << feed_entry
- end
-
- feed
- end
-
- def self.skip(parser, attribute)
- attributes = case attribute
- when :skipHours: :hours
- when :skipDays: :days
- end
+ require 'rss'
+
+ # For some reason, this is only included in the RDF Item by default (in 0.1.6).
+ unless RSS::Rss::Channel::Item.new.respond_to?(:content_encoded)
+ class RSS::Rss::Channel::Item # :nodoc:
+ include RSS::ContentModel
+ end
+ end
+
+ # Add equality onto Enclosures.
+ class RSS::Rss::Channel::Item::Enclosure
+ def eql?(enc)
+ instance_variables.all? do |iv|
+ instance_variable_get(iv) == enc.instance_variable_get(iv)
+ end
+ end
+
+ alias == eql?
+ end
+
+ module FeedNormalizer
+ class RubyRssParser < Parser
+
+ def self.parser
+ RSS::Parser
+ end
+
+ def self.parse(xml, loose)
+ begin
+ rss = parser.parse(xml)
+ rescue Exception => e
+ #puts "Parser #{parser} failed because #{e.message.gsub("\n",', ')}"
+ return nil
+ end
+
+ # check for channel to make sure we're only dealing with RSS.
+ rss && rss.respond_to?(:channel) ? package(rss, loose) : nil
+ end
+
+ # Fairly high priority; a fast and strict parser.
+ def self.priority
+ 100
+ end
+
+ protected
+
+ def self.package(rss, loose)
+ feed = Feed.new(self)
+
+ # channel elements
+ feed_mapping = {
+ :generator => :generator,
+ :title => :title,
+ :urls => :link,
+ :description => :description,
+ :copyright => :copyright,
+ :authors => :managingEditor,
+ :last_updated => [:lastBuildDate, :pubDate, :dc_date],
+ :id => :guid,
+ :ttl => :ttl
+ }
+
+ # make two passes, to catch all possible root elements
+ map_functions!(feed_mapping, rss, feed)
+ map_functions!(feed_mapping, rss.channel, feed)
+
+ # custom channel elements
+ feed.image = rss.image ? rss.image.url : nil
+ feed.skip_hours = skip(rss, :skipHours)
+ feed.skip_days = skip(rss, :skipDays)
+
+ # item elements
+ item_mapping = {
+ :date_published => [:pubDate, :dc_date],
+ :urls => :link,
+ :enclosures => :enclosure,
+ :description => :description,
+ :content => [:content_encoded, :description],
+ :title => :title,
+ :authors => [:author, :dc_creator],
+ :last_updated => [:pubDate, :dc_date] # This is effectively an alias for date_published for this parser.
+ }
+
+ rss.items.each do |rss_item|
+ feed_entry = Entry.new
+ map_functions!(item_mapping, rss_item, feed_entry)
+
+ # custom item elements
+ feed_entry.id = rss_item.guid.content if rss_item.respond_to?(:guid) && rss_item.guid
+ feed_entry.copyright = rss.copyright if rss_item.respond_to? :copyright
+ feed_entry.categories = loose ?
+ rss_item.categories.collect{|c|c.content} :
+ [rss_item.categories.first.content] rescue []
+
+ feed.entries << feed_entry
+ end
+
+ feed
+ end
+
+ def self.skip(parser, attribute)
+ case attribute
+ when :skipHours then attributes = :hours
+ when :skipDays then attributes = :days
+ end
  channel = parser.channel
 
  return nil unless channel.respond_to?(attribute) && a = channel.send(attribute)
- a.send(attributes).collect{|e| e.content}
- end
-
- end
- end
+ a.send(attributes).collect{|e| e.content}
+ end
+
+ end
+ end
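
The second hunk (the Ruby RSS parser) adapts to the 0.1.6-era rss library and Ruby 1.9: the ContentModel include is applied only when content_encoded is missing, parse now rejects parser output with no channel (so non-RSS input never reaches package), item enclosures are mapped onto entries via :enclosures => :enclosure, Enclosure gains value-based equality, and skip drops the "when x: y" case syntax that Ruby 1.9 removed. A small sketch of the new Enclosure equality, assuming the rss library's classic positional constructor (url, length, type):

    require 'rss'

    a = RSS::Rss::Channel::Item::Enclosure.new('http://example.com/a.mp3', 1234, 'audio/mpeg')
    b = RSS::Rss::Channel::Item::Enclosure.new('http://example.com/a.mp3', 1234, 'audio/mpeg')

    # With the patch above, equality compares instance variables,
    # not object identity:
    a == b  # => true

Since entries now carry their enclosures, value equality here presumably lets two parses of the same feed compare as equal rather than differing on freshly-allocated Enclosure objects.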