feed-normalizer 1.5.1 → 1.5.2
This diff shows the changes between publicly available package versions as they appear in their respective public registries, and is provided for informational purposes only.
- data/History.txt +48 -48
- data/License.txt +27 -27
- data/Manifest.txt +18 -19
- data/README.txt +63 -63
- data/Rakefile +29 -25
- data/lib/feed-normalizer.rb +149 -149
- data/lib/html-cleaner.rb +181 -190
- data/lib/parsers/rss.rb +110 -95
- data/lib/parsers/simple-rss.rb +138 -137
- data/lib/structures.rb +245 -244
- data/test/data/atom03.xml +128 -127
- data/test/data/atom10.xml +114 -112
- data/test/data/rdf10.xml +1498 -1498
- data/test/data/rss20.xml +64 -63
- data/test/data/rss20diff.xml +59 -59
- data/test/data/rss20diff_short.xml +51 -51
- data/test/test_feednormalizer.rb +265 -267
- data/test/test_htmlcleaner.rb +156 -155
- metadata +99 -63
- data/test/test_all.rb +0 -6
data/lib/html-cleaner.rb
CHANGED
@@ -1,190 +1,181 @@
 require 'rubygems'
 require 'hpricot'
 require 'cgi'

 module FeedNormalizer

   # Various methods for cleaning up HTML and preparing it for safe public
   # consumption.
   #
   # Documents used for refrence:
   # - http://www.w3.org/TR/html4/index/attributes.html
   # - http://en.wikipedia.org/wiki/List_of_XML_and_HTML_character_entity_references
   # - http://feedparser.org/docs/html-sanitization.html
   # - http://code.whytheluckystiff.net/hpricot/wiki
   class HtmlCleaner

     # allowed html elements.
     HTML_ELEMENTS = %w(
       a abbr acronym address area b bdo big blockquote br button caption center
       cite code col colgroup dd del dfn dir div dl dt em fieldset font h1 h2 h3
       h4 h5 h6 hr i img ins kbd label legend li map menu ol optgroup p pre q s
       samp small span strike strong sub sup table tbody td tfoot th thead tr tt
       u ul var
     )

     # allowed attributes.
     HTML_ATTRS = %w(
       abbr accept accept-charset accesskey align alt axis border cellpadding
       cellspacing char charoff charset checked cite class clear cols colspan
       color compact coords datetime dir disabled for frame headers height href
       hreflang hspace id ismap label lang longdesc maxlength media method
       multiple name nohref noshade nowrap readonly rel rev rows rowspan rules
       scope selected shape size span src start summary tabindex target title
       type usemap valign value vspace width
     )

     # allowed attributes, but they can contain URIs, extra caution required.
     # NOTE: That means this doesnt list *all* URI attrs, just the ones that are allowed.
     HTML_URI_ATTRS = %w(
       href src cite usemap longdesc
     )

     DODGY_URI_SCHEMES = %w(
       javascript vbscript mocha livescript data
     )

     class << self

       # Does this:
       # - Unescape HTML
       # - Parse HTML into tree
       # - Find 'body' if present, and extract tree inside that tag, otherwise parse whole tree
       # - Each tag:
       #   - remove tag if not whitelisted
       #   - escape HTML tag contents
       #   - remove all attributes not on whitelist
       #   - extra-scrub URI attrs; see dodgy_uri?
       #
       # Extra (i.e. unmatched) ending tags and comments are removed.
       def clean(str)
         str = unescapeHTML(str)

         doc = Hpricot(str, :fixup_tags => true)
         doc = subtree(doc, :body)

         # get all the tags in the document
         # Somewhere near hpricot 0.4.92 "*" starting to return all elements,
         # including text nodes instead of just tagged elements.
         tags = (doc/"*").inject([]) { |m,e| m << e.name if(e.respond_to?(:name) && e.name =~ /^\w+$/) ; m }.uniq

         # Remove tags that aren't whitelisted.
         remove_tags!(doc, tags - HTML_ELEMENTS)
         remaining_tags = tags & HTML_ELEMENTS

         # Remove attributes that aren't on the whitelist, or are suspicious URLs.
         (doc/remaining_tags.join(",")).each do |element|
-          element.raw_attributes.
...
-# From: Chris Gehlker <canyonrat mac.com>
-# Date: Fri, 11 Aug 2006 03:19:13 +0900
-class Hpricot::Text #:nodoc:
-  def set(string)
-    @content = string
-    self.raw_string = string
-  end
-end
-
+          next if element.raw_attributes.nil? || element.raw_attributes.empty?
+          element.raw_attributes.reject! do |attr,val|
+            !HTML_ATTRS.include?(attr) || (HTML_URI_ATTRS.include?(attr) && dodgy_uri?(val))
+          end
+
+          element.raw_attributes = element.raw_attributes.build_hash {|a,v| [a, add_entities(v)]}
+        end unless remaining_tags.empty?
+
+        doc.traverse_text do |t|
+          t.swap(add_entities(t.to_html))
+        end
+
+        # Return the tree, without comments. Ugly way of removing comments,
+        # but can't see a way to do this in Hpricot yet.
+        doc.to_s.gsub(/<\!--.*?-->/mi, '')
+      end
+
+      # For all other feed elements:
+      # - Unescape HTML.
+      # - Parse HTML into tree (taking 'body' as root, if present)
+      # - Takes text out of each tag, and escapes HTML.
+      # - Returns all text concatenated.
+      def flatten(str)
+        str.gsub!("\n", " ")
+        str = unescapeHTML(str)
+
+        doc = Hpricot(str, :xhtml_strict => true)
+        doc = subtree(doc, :body)
+
+        out = []
+        doc.traverse_text {|t| out << add_entities(t.to_html)}
+
+        return out.join
+      end
+
+      # Returns true if the given string contains a suspicious URL,
+      # i.e. a javascript link.
+      #
+      # This method rejects javascript, vbscript, livescript, mocha and data URLs.
+      # It *could* be refined to only deny dangerous data URLs, however.
+      def dodgy_uri?(uri)
+        uri = uri.to_s
+
+        # special case for poorly-formed entities (missing ';')
+        # if these occur *anywhere* within the string, then throw it out.
+        return true if (uri =~ /&\#(\d+|x[0-9a-f]+)[^;\d]/mi)
+
+        # Try escaping as both HTML or URI encodings, and then trying
+        # each scheme regexp on each
+        [unescapeHTML(uri), CGI.unescape(uri)].each do |unesc_uri|
+          DODGY_URI_SCHEMES.each do |scheme|
+
+            regexp = "#{scheme}:".gsub(/./) do |char|
+              "([\000-\037\177\s]*)#{char}"
+            end
+
+            # regexp looks something like
+            # /\A([\000-\037\177\s]*)j([\000-\037\177\s]*)a([\000-\037\177\s]*)v([\000-\037\177\s]*)a([\000-\037\177\s]*)s([\000-\037\177\s]*)c([\000-\037\177\s]*)r([\000-\037\177\s]*)i([\000-\037\177\s]*)p([\000-\037\177\s]*)t([\000-\037\177\s]*):/mi
+            return true if (unesc_uri =~ %r{\A#{regexp}}mi)
+          end
+        end
+
+        nil
+      end
+
+      # unescapes HTML. If xml is true, also converts XML-only named entities to HTML.
+      def unescapeHTML(str, xml = true)
+        CGI.unescapeHTML(xml ? str.gsub("&apos;", "'") : str)
+      end
+
+      # Adds entities where possible.
+      # Works like CGI.escapeHTML, but will not escape existing entities;
+      # i.e. &#123; will NOT become &amp;#123;
+      #
+      # This method could be improved by adding a whitelist of html entities.
+      def add_entities(str)
+        str.to_s.gsub(/\"/n, '&quot;').gsub(/>/n, '&gt;').gsub(/</n, '&lt;').gsub(/&(?!(\#\d+|\#x([0-9a-f]+)|\w{2,8});)/nmi, '&amp;')
+      end
+
+      private
+
+      # Everything below elment, or the just return the doc if element not present.
+      def subtree(doc, element)
+        doc.at("//#{element}/*") || doc
+      end
+
+      def remove_tags!(doc, tags)
+        (doc/tags.join(",")).remove unless tags.empty?
+      end
+
+    end
+  end
+end
+
+
+module Enumerable #:nodoc:
+  def build_hash
+    result = {}
+    self.each do |elt|
+      key, value = yield elt
+      result[key] = value
+    end
+    result
+  end
+end
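
Taken together, clean is a whitelist sanitizer: tags outside HTML_ELEMENTS are removed outright (children included), attributes outside HTML_ATTRS are dropped, and the URI-bearing attributes in HTML_URI_ATTRS are additionally screened by dodgy_uri?, whose generated regexp permits runs of control characters and whitespace between the letters of a scheme, so obfuscated javascript: URLs are still caught. A minimal usage sketch of the 1.5.2 API (the sample markup is hypothetical; the exact serialization of the cleaned tree can vary with the installed hpricot version):

    require 'rubygems'
    require 'feed-normalizer'

    html = "<p onclick=\"alert(1)\">Hi <a href=\"java\tscript:alert(1)\">x</a><script>bad()</script></p>"

    # <script> is not in HTML_ELEMENTS, so the element and its contents are
    # removed; onclick is not in HTML_ATTRS, so it is dropped; href is on the
    # attribute whitelist but fails dodgy_uri? (the tab cannot hide the
    # javascript: scheme), so it is dropped too. Result is roughly:
    # <p>Hi <a>x</a></p>
    puts FeedNormalizer::HtmlCleaner.clean(html)

    FeedNormalizer::HtmlCleaner.dodgy_uri?("java\tscript:alert(1)")  # => true (truthy)
    FeedNormalizer::HtmlCleaner.dodgy_uri?("http://example.com/")    # => nil

    # flatten strips markup and re-escapes the remaining text.
    FeedNormalizer::HtmlCleaner.flatten("<b>one</b> &amp; two")      # => "one &amp; two"
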
data/lib/parsers/rss.rb
CHANGED
@@ -1,98 +1,113 @@
-require 'rss'
-
-# For some reason, this is only included in the RDF Item by default.
...
+require 'rss'
+
+# For some reason, this is only included in the RDF Item by default (in 0.1.6).
+unless RSS::Rss::Channel::Item.new.respond_to?(:content_encoded)
+  class RSS::Rss::Channel::Item # :nodoc:
+    include RSS::ContentModel
+  end
+end
+
+# Add equality onto Enclosures.
+class RSS::Rss::Channel::Item::Enclosure
+  def eql?(enc)
+    instance_variables.all? do |iv|
+      instance_variable_get(iv) == enc.instance_variable_get(iv)
+    end
+  end
+
+  alias == eql?
+end
+
+module FeedNormalizer
+  class RubyRssParser < Parser
+
+    def self.parser
+      RSS::Parser
+    end
+
+    def self.parse(xml, loose)
+      begin
+        rss = parser.parse(xml)
+      rescue Exception => e
+        #puts "Parser #{parser} failed because #{e.message.gsub("\n",', ')}"
+        return nil
+      end
+
+      # check for channel to make sure we're only dealing with RSS.
+      rss && rss.respond_to?(:channel) ? package(rss, loose) : nil
+    end
+
+    # Fairly high priority; a fast and strict parser.
+    def self.priority
+      100
+    end
+
+    protected
+
+    def self.package(rss, loose)
+      feed = Feed.new(self)
+
+      # channel elements
+      feed_mapping = {
+        :generator => :generator,
+        :title => :title,
+        :urls => :link,
+        :description => :description,
+        :copyright => :copyright,
+        :authors => :managingEditor,
+        :last_updated => [:lastBuildDate, :pubDate, :dc_date],
+        :id => :guid,
+        :ttl => :ttl
+      }
+
+      # make two passes, to catch all possible root elements
+      map_functions!(feed_mapping, rss, feed)
+      map_functions!(feed_mapping, rss.channel, feed)
+
+      # custom channel elements
+      feed.image = rss.image ? rss.image.url : nil
+      feed.skip_hours = skip(rss, :skipHours)
+      feed.skip_days = skip(rss, :skipDays)
+
+      # item elements
+      item_mapping = {
+        :date_published => [:pubDate, :dc_date],
+        :urls => :link,
+        :enclosures => :enclosure,
+        :description => :description,
+        :content => [:content_encoded, :description],
+        :title => :title,
+        :authors => [:author, :dc_creator],
+        :last_updated => [:pubDate, :dc_date] # This is effectively an alias for date_published for this parser.
+      }
+
+      rss.items.each do |rss_item|
+        feed_entry = Entry.new
+        map_functions!(item_mapping, rss_item, feed_entry)
+
+        # custom item elements
+        feed_entry.id = rss_item.guid.content if rss_item.respond_to?(:guid) && rss_item.guid
+        feed_entry.copyright = rss.copyright if rss_item.respond_to? :copyright
+        feed_entry.categories = loose ?
+          rss_item.categories.collect{|c|c.content} :
+          [rss_item.categories.first.content] rescue []
+
+        feed.entries << feed_entry
+      end
+
+      feed
+    end
+
+    def self.skip(parser, attribute)
+      case attribute
+        when :skipHours then attributes = :hours
+        when :skipDays then attributes = :days
+      end
       channel = parser.channel

       return nil unless channel.respond_to?(attribute) && a = channel.send(attribute)
-      a.send(attributes).collect{|e| e.content}
-    end
-
-  end
-end
+      a.send(attributes).collect{|e| e.content}
+    end
+
+  end
+end
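
The feed_mapping and item_mapping hashes drive map_functions!, which is inherited from the Parser base class in data/lib/feed-normalizer.rb (not part of this diff): each Feed or Entry attribute on the left is filled from the RSS accessor on the right, and an array value expresses fallback order, so an entry's content comes from content_encoded when present and from description otherwise. A short sketch of the parser in use through the gem's top-level entry point (the sample feed is hypothetical):

    require 'rubygems'
    require 'feed-normalizer'

    xml = <<-XML
    <rss version="2.0">
      <channel>
        <title>Example Channel</title>
        <link>http://example.com/</link>
        <description>A demo channel</description>
        <lastBuildDate>Sat, 05 Jan 2008 12:00:00 GMT</lastBuildDate>
        <item>
          <title>First post</title>
          <link>http://example.com/posts/1</link>
          <description>Hello world</description>
          <guid>http://example.com/posts/1</guid>
          <pubDate>Fri, 04 Jan 2008 09:00:00 GMT</pubDate>
        </item>
      </channel>
    </rss>
    XML

    feed = FeedNormalizer::FeedNormalizer.parse(xml)

    feed.title          # => "Example Channel" (channel title via feed_mapping)
    feed.last_updated   # lastBuildDate: the first hit in [:lastBuildDate, :pubDate, :dc_date]

    entry = feed.entries.first
    entry.title         # => "First post"
    entry.content       # "Hello world": no content:encoded, so description is used
    entry.id            # => "http://example.com/posts/1" (from guid)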