feed-normalizer 1.5.1 → 1.5.2
- data/History.txt +48 -48
- data/License.txt +27 -27
- data/Manifest.txt +18 -19
- data/README.txt +63 -63
- data/Rakefile +29 -25
- data/lib/feed-normalizer.rb +149 -149
- data/lib/html-cleaner.rb +181 -190
- data/lib/parsers/rss.rb +110 -95
- data/lib/parsers/simple-rss.rb +138 -137
- data/lib/structures.rb +245 -244
- data/test/data/atom03.xml +128 -127
- data/test/data/atom10.xml +114 -112
- data/test/data/rdf10.xml +1498 -1498
- data/test/data/rss20.xml +64 -63
- data/test/data/rss20diff.xml +59 -59
- data/test/data/rss20diff_short.xml +51 -51
- data/test/test_feednormalizer.rb +265 -267
- data/test/test_htmlcleaner.rb +156 -155
- metadata +99 -63
- data/test/test_all.rb +0 -6
data/lib/html-cleaner.rb
CHANGED
@@ -1,190 +1,181 @@
-# From: Chris Gehlker <canyonrat mac.com>
-# Date: Fri, 11 Aug 2006 03:19:13 +0900
-class Hpricot::Text #:nodoc:
-  def set(string)
-    @content = string
-    self.raw_string = string
-  end
-end
+require 'rubygems'
+require 'hpricot'
+require 'cgi'
+
+module FeedNormalizer
+
+  # Various methods for cleaning up HTML and preparing it for safe public
+  # consumption.
+  #
+  # Documents used for refrence:
+  # - http://www.w3.org/TR/html4/index/attributes.html
+  # - http://en.wikipedia.org/wiki/List_of_XML_and_HTML_character_entity_references
+  # - http://feedparser.org/docs/html-sanitization.html
+  # - http://code.whytheluckystiff.net/hpricot/wiki
+  class HtmlCleaner
+
+    # allowed html elements.
+    HTML_ELEMENTS = %w(
+      a abbr acronym address area b bdo big blockquote br button caption center
+      cite code col colgroup dd del dfn dir div dl dt em fieldset font h1 h2 h3
+      h4 h5 h6 hr i img ins kbd label legend li map menu ol optgroup p pre q s
+      samp small span strike strong sub sup table tbody td tfoot th thead tr tt
+      u ul var
+    )
+
+    # allowed attributes.
+    HTML_ATTRS = %w(
+      abbr accept accept-charset accesskey align alt axis border cellpadding
+      cellspacing char charoff charset checked cite class clear cols colspan
+      color compact coords datetime dir disabled for frame headers height href
+      hreflang hspace id ismap label lang longdesc maxlength media method
+      multiple name nohref noshade nowrap readonly rel rev rows rowspan rules
+      scope selected shape size span src start summary tabindex target title
+      type usemap valign value vspace width
+    )
+
+    # allowed attributes, but they can contain URIs, extra caution required.
+    # NOTE: That means this doesnt list *all* URI attrs, just the ones that are allowed.
+    HTML_URI_ATTRS = %w(
+      href src cite usemap longdesc
+    )
+
+    DODGY_URI_SCHEMES = %w(
+      javascript vbscript mocha livescript data
+    )
+
+    class << self
+
+      # Does this:
+      # - Unescape HTML
+      # - Parse HTML into tree
+      # - Find 'body' if present, and extract tree inside that tag, otherwise parse whole tree
+      # - Each tag:
+      #   - remove tag if not whitelisted
+      #   - escape HTML tag contents
+      #   - remove all attributes not on whitelist
+      #   - extra-scrub URI attrs; see dodgy_uri?
+      #
+      # Extra (i.e. unmatched) ending tags and comments are removed.
+      def clean(str)
+        str = unescapeHTML(str)
+
+        doc = Hpricot(str, :fixup_tags => true)
+        doc = subtree(doc, :body)
+
+        # get all the tags in the document
+        # Somewhere near hpricot 0.4.92 "*" starting to return all elements,
+        # including text nodes instead of just tagged elements.
+        tags = (doc/"*").inject([]) { |m,e| m << e.name if(e.respond_to?(:name) && e.name =~ /^\w+$/) ; m }.uniq
+
+        # Remove tags that aren't whitelisted.
+        remove_tags!(doc, tags - HTML_ELEMENTS)
+        remaining_tags = tags & HTML_ELEMENTS
+
+        # Remove attributes that aren't on the whitelist, or are suspicious URLs.
+        (doc/remaining_tags.join(",")).each do |element|
+          next if element.raw_attributes.nil? || element.raw_attributes.empty?
+          element.raw_attributes.reject! do |attr,val|
+            !HTML_ATTRS.include?(attr) || (HTML_URI_ATTRS.include?(attr) && dodgy_uri?(val))
+          end
+
+          element.raw_attributes = element.raw_attributes.build_hash {|a,v| [a, add_entities(v)]}
+        end unless remaining_tags.empty?
+
+        doc.traverse_text do |t|
+          t.swap(add_entities(t.to_html))
+        end
+
+        # Return the tree, without comments. Ugly way of removing comments,
+        # but can't see a way to do this in Hpricot yet.
+        doc.to_s.gsub(/<\!--.*?-->/mi, '')
+      end
+
+      # For all other feed elements:
+      # - Unescape HTML.
+      # - Parse HTML into tree (taking 'body' as root, if present)
+      # - Takes text out of each tag, and escapes HTML.
+      # - Returns all text concatenated.
+      def flatten(str)
+        str.gsub!("\n", " ")
+        str = unescapeHTML(str)
+
+        doc = Hpricot(str, :xhtml_strict => true)
+        doc = subtree(doc, :body)
+
+        out = []
+        doc.traverse_text {|t| out << add_entities(t.to_html)}
+
+        return out.join
+      end
+
+      # Returns true if the given string contains a suspicious URL,
+      # i.e. a javascript link.
+      #
+      # This method rejects javascript, vbscript, livescript, mocha and data URLs.
+      # It *could* be refined to only deny dangerous data URLs, however.
+      def dodgy_uri?(uri)
+        uri = uri.to_s
+
+        # special case for poorly-formed entities (missing ';')
+        # if these occur *anywhere* within the string, then throw it out.
+        return true if (uri =~ /&\#(\d+|x[0-9a-f]+)[^;\d]/mi)
+
+        # Try escaping as both HTML or URI encodings, and then trying
+        # each scheme regexp on each
+        [unescapeHTML(uri), CGI.unescape(uri)].each do |unesc_uri|
+          DODGY_URI_SCHEMES.each do |scheme|
+
+            regexp = "#{scheme}:".gsub(/./) do |char|
+              "([\000-\037\177\s]*)#{char}"
+            end
+
+            # regexp looks something like
+            # /\A([\000-\037\177\s]*)j([\000-\037\177\s]*)a([\000-\037\177\s]*)v([\000-\037\177\s]*)a([\000-\037\177\s]*)s([\000-\037\177\s]*)c([\000-\037\177\s]*)r([\000-\037\177\s]*)i([\000-\037\177\s]*)p([\000-\037\177\s]*)t([\000-\037\177\s]*):/mi
+            return true if (unesc_uri =~ %r{\A#{regexp}}mi)
+          end
+        end
+
+        nil
+      end
+
+      # unescapes HTML. If xml is true, also converts XML-only named entities to HTML.
+      def unescapeHTML(str, xml = true)
+        CGI.unescapeHTML(xml ? str.gsub("&apos;", "'") : str)
+      end
+
+      # Adds entities where possible.
+      # Works like CGI.escapeHTML, but will not escape existing entities;
+      # i.e. &#123; will NOT become &amp;#123;
+      #
+      # This method could be improved by adding a whitelist of html entities.
+      def add_entities(str)
+        str.to_s.gsub(/\"/n, '&quot;').gsub(/>/n, '&gt;').gsub(/</n, '&lt;').gsub(/&(?!(\#\d+|\#x([0-9a-f]+)|\w{2,8});)/nmi, '&amp;')
+      end
+
+      private
+
+      # Everything below elment, or the just return the doc if element not present.
+      def subtree(doc, element)
+        doc.at("//#{element}/*") || doc
+      end
+
+      def remove_tags!(doc, tags)
+        (doc/tags.join(",")).remove unless tags.empty?
+      end
+
+    end
+  end
+end
+
+
+module Enumerable #:nodoc:
+  def build_hash
+    result = {}
+    self.each do |elt|
+      key, value = yield elt
+      result[key] = value
+    end
+    result
+  end
+end
data/lib/parsers/rss.rb
CHANGED
@@ -1,98 +1,113 @@
-require 'rss'
-
-# For some reason, this is only included in the RDF Item by default.
+require 'rss'
+
+# For some reason, this is only included in the RDF Item by default (in 0.1.6).
+unless RSS::Rss::Channel::Item.new.respond_to?(:content_encoded)
+  class RSS::Rss::Channel::Item # :nodoc:
+    include RSS::ContentModel
+  end
+end
+
+# Add equality onto Enclosures.
+class RSS::Rss::Channel::Item::Enclosure
+  def eql?(enc)
+    instance_variables.all? do |iv|
+      instance_variable_get(iv) == enc.instance_variable_get(iv)
+    end
+  end
+
+  alias == eql?
+end
+
+module FeedNormalizer
+  class RubyRssParser < Parser
+
+    def self.parser
+      RSS::Parser
+    end
+
+    def self.parse(xml, loose)
+      begin
+        rss = parser.parse(xml)
+      rescue Exception => e
+        #puts "Parser #{parser} failed because #{e.message.gsub("\n",', ')}"
+        return nil
+      end
+
+      # check for channel to make sure we're only dealing with RSS.
+      rss && rss.respond_to?(:channel) ? package(rss, loose) : nil
+    end
+
+    # Fairly high priority; a fast and strict parser.
+    def self.priority
+      100
+    end
+
+    protected
+
+    def self.package(rss, loose)
+      feed = Feed.new(self)
+
+      # channel elements
+      feed_mapping = {
+        :generator => :generator,
+        :title => :title,
+        :urls => :link,
+        :description => :description,
+        :copyright => :copyright,
+        :authors => :managingEditor,
+        :last_updated => [:lastBuildDate, :pubDate, :dc_date],
+        :id => :guid,
+        :ttl => :ttl
+      }
+
+      # make two passes, to catch all possible root elements
+      map_functions!(feed_mapping, rss, feed)
+      map_functions!(feed_mapping, rss.channel, feed)
+
+      # custom channel elements
+      feed.image = rss.image ? rss.image.url : nil
+      feed.skip_hours = skip(rss, :skipHours)
+      feed.skip_days = skip(rss, :skipDays)
+
+      # item elements
+      item_mapping = {
+        :date_published => [:pubDate, :dc_date],
+        :urls => :link,
+        :enclosures => :enclosure,
+        :description => :description,
+        :content => [:content_encoded, :description],
+        :title => :title,
+        :authors => [:author, :dc_creator],
+        :last_updated => [:pubDate, :dc_date] # This is effectively an alias for date_published for this parser.
+      }
+
+      rss.items.each do |rss_item|
+        feed_entry = Entry.new
+        map_functions!(item_mapping, rss_item, feed_entry)
+
+        # custom item elements
+        feed_entry.id = rss_item.guid.content if rss_item.respond_to?(:guid) && rss_item.guid
+        feed_entry.copyright = rss.copyright if rss_item.respond_to? :copyright
+        feed_entry.categories = loose ?
+          rss_item.categories.collect{|c|c.content} :
+          [rss_item.categories.first.content] rescue []
+
+        feed.entries << feed_entry
+      end
+
+      feed
+    end
+
+    def self.skip(parser, attribute)
+      case attribute
+        when :skipHours then attributes = :hours
+        when :skipDays then attributes = :days
+      end
+      channel = parser.channel
+
+      return nil unless channel.respond_to?(attribute) && a = channel.send(attribute)
+      a.send(attributes).collect{|e| e.content}
+    end
+
+  end
+end