feed-normalizer 1.1.0 → 1.2.0
Sign up to get free protection for your applications and to get access to all the features.
- data/History.txt +14 -0
- data/{LICENSE → License.txt} +0 -0
- data/Manifest.txt +19 -0
- data/Rakefile +16 -40
- data/{README → Readme.txt} +12 -1
- data/lib/feed-normalizer.rb +6 -10
- data/lib/html-cleaner.rb +186 -0
- data/lib/parsers/rss.rb +1 -1
- data/lib/parsers/simple-rss.rb +3 -5
- data/lib/structures.rb +105 -35
- data/test/data/rss20diff.xml +49 -0
- data/test/data/rss20diff_short.xml +40 -0
- data/test/test_all.rb +6 -0
- data/test/test_feednormalizer.rb +137 -0
- data/test/test_htmlcleaner.rb +151 -0
- metadata +38 -16
- data/RELEASE +0 -13
- data/test/base_test.rb +0 -82
data/History.txt
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
|
2
|
+
1.2.0
|
3
|
+
|
4
|
+
* Added HtmlCleaner - sanitizes HTML and removes 'bad' URIs to a level suitable
|
5
|
+
for 'safe' display inside a web browser. Can be used as a standalone library,
|
6
|
+
or as part of the Feed object. See Feed.clean! for details about cleaning a
|
7
|
+
Feed instance. Also see HtmlCleaner and its unit tests. Uses Hpricot.
|
8
|
+
* Added Feed-diffing. Differences between two feeds can be displayed using
|
9
|
+
Feed.diff. Works nicely with YAML for a readable diff.
|
10
|
+
* FeedNormalizer.parse now takes a hash for its arguments.
|
11
|
+
* Removed FN::Content.
|
12
|
+
* Now uses Hoe!
|
13
|
+
|
14
|
+
|
data/{LICENSE → License.txt}
RENAMED
File without changes
|
data/Manifest.txt
ADDED
@@ -0,0 +1,19 @@
|
|
1
|
+
History.txt
|
2
|
+
License.txt
|
3
|
+
Manifest.txt
|
4
|
+
Rakefile
|
5
|
+
Readme.txt
|
6
|
+
lib/feed-normalizer.rb
|
7
|
+
lib/html-cleaner.rb
|
8
|
+
lib/parsers/rss.rb
|
9
|
+
lib/parsers/simple-rss.rb
|
10
|
+
lib/structures.rb
|
11
|
+
test/data/atom03.xml
|
12
|
+
test/data/atom10.xml
|
13
|
+
test/data/rdf10.xml
|
14
|
+
test/data/rss20.xml
|
15
|
+
test/data/rss20diff.xml
|
16
|
+
test/data/rss20diff_short.xml
|
17
|
+
test/test_all.rb
|
18
|
+
test/test_feednormalizer.rb
|
19
|
+
test/test_htmlcleaner.rb
|
data/Rakefile
CHANGED
@@ -1,49 +1,25 @@
|
|
1
|
-
require '
|
2
|
-
require 'rake'
|
3
|
-
require 'rake/testtask'
|
4
|
-
require 'rake/rdoctask'
|
5
|
-
require 'rake/clean'
|
6
|
-
require 'rake/gempackagetask'
|
1
|
+
require 'hoe'
|
7
2
|
|
8
|
-
|
9
|
-
"lib/**/*", "test/**/*", "[A-Z]*", "Rakefile", "html/**/*"
|
10
|
-
]
|
11
|
-
|
12
|
-
Gem::manage_gems
|
13
|
-
|
14
|
-
task :default => [:test]
|
15
|
-
task :package => [:test, :doc]
|
16
|
-
|
17
|
-
spec = Gem::Specification.new do |s|
|
18
|
-
s.name = "feed-normalizer"
|
19
|
-
s.version = "1.1.0"
|
3
|
+
Hoe.new("feed-normalizer", "1.2.0") do |s|
|
20
4
|
s.author = "Andrew A. Smith"
|
21
5
|
s.email = "andy@tinnedfruit.org"
|
22
|
-
s.
|
23
|
-
s.platform = Gem::Platform::RUBY
|
6
|
+
s.url = "http://feed-normalizer.rubyforge.org/"
|
24
7
|
s.summary = "Extensible Ruby wrapper for Atom and RSS parsers"
|
25
|
-
s.
|
26
|
-
s.
|
27
|
-
s.
|
28
|
-
s.
|
29
|
-
s.
|
8
|
+
s.description = s.paragraphs_of('Readme.txt', 1..2).join("\n\n")
|
9
|
+
s.changes = s.paragraphs_of('History.txt', 0..1).join("\n\n")
|
10
|
+
s.extra_deps << ["simple-rss", ">= 1.1"]
|
11
|
+
s.extra_deps << ["hpricot", ">= 0.4"]
|
12
|
+
s.need_zip = true
|
13
|
+
s.need_tar = false
|
30
14
|
end
|
31
15
|
|
32
|
-
Rake::GemPackageTask.new(spec) do |pkg|
|
33
|
-
pkg.need_zip = true
|
34
|
-
end
|
35
|
-
|
36
|
-
Rake::TestTask.new do |t|
|
37
|
-
t.libs << "test"
|
38
|
-
t.test_files = FileList['test/*_test.rb']
|
39
|
-
t.verbose = true
|
40
|
-
end
|
41
16
|
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
17
|
+
begin
|
18
|
+
require 'rcov/rcovtask'
|
19
|
+
Rcov::RcovTask.new("rcov") do |t|
|
20
|
+
t.test_files = Dir['test/test_all.rb']
|
21
|
+
end
|
22
|
+
rescue LoadError
|
23
|
+
nil
|
48
24
|
end
|
49
25
|
|
data/{README → Readme.txt}
RENAMED
@@ -38,6 +38,17 @@ The feed representation stays the same, even though a different parser was used.
|
|
38
38
|
feed.class # => FeedNormalizer::Feed
|
39
39
|
feed.parser # => SimpleRSS
|
40
40
|
|
41
|
+
== Cleaning / Sanitizing
|
42
|
+
|
43
|
+
feed.title # => "My Feed > Your Feed"
|
44
|
+
feed.entries.first.content # => "<p x='y'>Hello</p><object></object></html>"
|
45
|
+
feed.clean!
|
46
|
+
|
47
|
+
All elements should now be either clean HTML, or HTML escaped strings.
|
48
|
+
|
49
|
+
feed.title # => "My Feed &gt; Your Feed"
|
50
|
+
feed.entries.first.content # => "<p>Hello</p>"
|
51
|
+
|
41
52
|
== Extending
|
42
53
|
|
43
54
|
Implement a parser wrapper by extending the FeedNormalizer::Parser class and overriding
|
@@ -49,4 +60,4 @@ See FeedNormalizer::RubyRssParser and FeedNormalizer::SimpleRssParser for exampl
|
|
49
60
|
== Authors
|
50
61
|
* Andrew A. Smith (andy@tinnedfruit.org)
|
51
62
|
|
52
|
-
This library is released under the terms of the BSD License (see the
|
63
|
+
This library is released under the terms of the BSD License (see the License.txt file for details).
|
data/lib/feed-normalizer.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
require 'structures'
|
2
|
+
require 'html-cleaner'
|
2
3
|
|
3
4
|
module FeedNormalizer
|
4
5
|
|
@@ -88,21 +89,16 @@ module FeedNormalizer
|
|
88
89
|
# used first, and if try_others is false, it is the only parser used,
|
89
90
|
# otherwise all parsers in the ParserRegistry are attempted next, in
|
90
91
|
# order of priority.
|
91
|
-
def self.parse(xml,
|
92
|
+
def self.parse(xml, opts = {})
|
92
93
|
|
93
94
|
# Get a string ASAP, as multiple read()'s will start returning nil..
|
94
95
|
xml = xml.respond_to?(:read) ? xml.read : xml.to_s
|
95
96
|
|
96
|
-
if
|
97
|
-
result =
|
97
|
+
if opts[:force_parser]
|
98
|
+
result = opts[:force_parser].parse(xml)
|
98
99
|
|
99
|
-
if result
|
100
|
-
|
101
|
-
elsif !try_others
|
102
|
-
return nil
|
103
|
-
else
|
104
|
-
# fall through and continue with other parsers
|
105
|
-
end
|
100
|
+
return result if result
|
101
|
+
return nil if opts[:try_others] == false
|
106
102
|
end
|
107
103
|
|
108
104
|
ParserRegistry.parsers.each do |parser|
|
data/lib/html-cleaner.rb
ADDED
@@ -0,0 +1,186 @@
|
|
1
|
+
require 'hpricot'
|
2
|
+
require 'cgi'
|
3
|
+
|
4
|
+
module FeedNormalizer
|
5
|
+
|
6
|
+
# Various methods for cleaning up HTML and preparing it for safe public
|
7
|
+
# consumption.
|
8
|
+
#
|
9
|
+
# Documents used for reference:
|
10
|
+
# - http://www.w3.org/TR/html4/index/attributes.html
|
11
|
+
# - http://en.wikipedia.org/wiki/List_of_XML_and_HTML_character_entity_references
|
12
|
+
# - http://feedparser.org/docs/html-sanitization.html
|
13
|
+
# - http://code.whytheluckystiff.net/hpricot/wiki
|
14
|
+
class HtmlCleaner
|
15
|
+
|
16
|
+
# allowed html elements.
|
17
|
+
HTML_ELEMENTS = %w(
|
18
|
+
a abbr acronym address area b bdo big blockquote br button caption center
|
19
|
+
cite code col colgroup dd del dfn dir div dl dt em fieldset font h1 h2 h3
|
20
|
+
h4 h5 h6 hr i img ins kbd label legend li map menu ol optgroup p pre q s
|
21
|
+
samp small span strike strong sub sup table tbody td tfoot th thead tr tt
|
22
|
+
u ul var
|
23
|
+
)
|
24
|
+
|
25
|
+
# allowed attributes.
|
26
|
+
HTML_ATTRS = %w(
|
27
|
+
abbr accept accept-charset accesskey align alt axis border cellpadding
|
28
|
+
cellspacing char charoff charset checked cite class clear cols colspan
|
29
|
+
color compact coords datetime dir disabled for frame headers height href
|
30
|
+
hreflang hspace id ismap label lang longdesc maxlength media method
|
31
|
+
multiple name nohref noshade nowrap readonly rel rev rows rowspan rules
|
32
|
+
scope selected shape size span src start summary tabindex target title
|
33
|
+
type usemap valign value vspace width
|
34
|
+
)
|
35
|
+
|
36
|
+
# allowed attributes, but they can contain URIs, extra caution required.
|
37
|
+
# NOTE: That means this doesn't list *all* URI attrs, just the ones that are allowed.
|
38
|
+
HTML_URI_ATTRS = %w(
|
39
|
+
href src cite usemap longdesc
|
40
|
+
)
|
41
|
+
|
42
|
+
DODGY_URI_SCHEMES = %w(
|
43
|
+
javascript vbscript mocha livescript data
|
44
|
+
)
|
45
|
+
|
46
|
+
class << self
|
47
|
+
|
48
|
+
# Does this:
|
49
|
+
# - Unescape HTML
|
50
|
+
# - Parse HTML into tree
|
51
|
+
# - Find 'body' if present, and extract tree inside that tag, otherwise parse whole tree
|
52
|
+
# - Each tag:
|
53
|
+
# - remove tag if not whitelisted
|
54
|
+
# - escape HTML tag contents
|
55
|
+
# - remove all attributes not on whitelist
|
56
|
+
# - extra-scrub URI attrs; see dodgy_uri?
|
57
|
+
#
|
58
|
+
# Extra (i.e. unmatched) ending tags and comments are removed.
|
59
|
+
def clean(str)
|
60
|
+
str = unescapeHTML(str)
|
61
|
+
|
62
|
+
doc = Hpricot(str, :xhtml_strict => true)
|
63
|
+
doc = subtree(doc, :body)
|
64
|
+
|
65
|
+
# get all the tags in the document
|
66
|
+
tags = (doc/"*").collect {|e| e.name}
|
67
|
+
|
68
|
+
# Remove tags that aren't whitelisted.
|
69
|
+
remove_tags!(doc, tags - HTML_ELEMENTS)
|
70
|
+
remaining_tags = tags & HTML_ELEMENTS
|
71
|
+
|
72
|
+
# Remove attributes that aren't on the whitelist, or are suspicious URLs.
|
73
|
+
(doc/remaining_tags.join(",")).each do |element|
|
74
|
+
element.attributes.reject! do |attr,val|
|
75
|
+
!HTML_ATTRS.include?(attr) || (HTML_URI_ATTRS.include?(attr) && dodgy_uri?(val))
|
76
|
+
end
|
77
|
+
|
78
|
+
element.attributes = element.attributes.build_hash {|a,v| [a, add_entities(v)]}
|
79
|
+
end unless remaining_tags.empty?
|
80
|
+
|
81
|
+
doc.traverse_text {|t| t.set(add_entities(t.to_s))}
|
82
|
+
|
83
|
+
# Return the tree, without comments. Ugly way of removing comments,
|
84
|
+
# but can't see a way to do this in Hpricot yet.
|
85
|
+
doc.to_s.gsub(/<\!--.*-->/mi, '')
|
86
|
+
end
|
87
|
+
|
88
|
+
# For all other feed elements:
|
89
|
+
# - Unescape HTML.
|
90
|
+
# - Parse HTML into tree (taking 'body' as root, if present)
|
91
|
+
# - Takes text out of each tag, and escapes HTML.
|
92
|
+
# - Returns all text concatenated.
|
93
|
+
def flatten(str)
|
94
|
+
str.gsub!("\n", " ")
|
95
|
+
str = unescapeHTML(str)
|
96
|
+
|
97
|
+
doc = Hpricot(str, :xhtml_strict => true)
|
98
|
+
doc = subtree(doc, :body)
|
99
|
+
|
100
|
+
out = ""
|
101
|
+
doc.traverse_text {|t| out << add_entities(t.to_s)}
|
102
|
+
|
103
|
+
return out
|
104
|
+
end
|
105
|
+
|
106
|
+
# Returns true if the given string contains a suspicious URL,
|
107
|
+
# i.e. a javascript link.
|
108
|
+
#
|
109
|
+
# This method rejects javascript, vbscript, livescript, mocha and data URLs.
|
110
|
+
# It *could* be refined to only deny dangerous data URLs, however.
|
111
|
+
def dodgy_uri?(uri)
|
112
|
+
|
113
|
+
# special case for poorly-formed entities (missing ';')
|
114
|
+
# if these occur *anywhere* within the string, then throw it out.
|
115
|
+
return true if (uri =~ /&\#(\d+|x[0-9a-f]+)[^;\d]/mi)
|
116
|
+
|
117
|
+
# Try unescaping as both HTML and URI encodings, and then trying
|
118
|
+
# each scheme regexp on each
|
119
|
+
[unescapeHTML(uri), CGI.unescape(uri)].each do |unesc_uri|
|
120
|
+
DODGY_URI_SCHEMES.each do |scheme|
|
121
|
+
|
122
|
+
regexp = "#{scheme}:".gsub(/./) do |char|
|
123
|
+
"([\000-\037\177\s]*)#{char}"
|
124
|
+
end
|
125
|
+
|
126
|
+
# regexp looks something like
|
127
|
+
# /\A([\000-\037\177\s]*)j([\000-\037\177\s]*)a([\000-\037\177\s]*)v([\000-\037\177\s]*)a([\000-\037\177\s]*)s([\000-\037\177\s]*)c([\000-\037\177\s]*)r([\000-\037\177\s]*)i([\000-\037\177\s]*)p([\000-\037\177\s]*)t([\000-\037\177\s]*):/mi
|
128
|
+
return true if (unesc_uri =~ %r{\A#{regexp}}mi)
|
129
|
+
end
|
130
|
+
end
|
131
|
+
|
132
|
+
nil
|
133
|
+
end
|
134
|
+
|
135
|
+
# unescapes HTML. If xml is true, also converts XML-only named entities to HTML.
|
136
|
+
def unescapeHTML(str, xml = true)
|
137
|
+
CGI.unescapeHTML(xml ? str.gsub("'", "'") : str)
|
138
|
+
end
|
139
|
+
|
140
|
+
# Adds entities where possible.
|
141
|
+
# Works like CGI.escapeHTML, but will not escape existing entities;
|
142
|
+
# i.e. &#123; will NOT become &amp;#123;
|
143
|
+
#
|
144
|
+
# This method could be improved by adding a whitelist of html entities.
|
145
|
+
def add_entities(str)
|
146
|
+
str.gsub(/\"/n, '"').gsub(/>/n, '>').gsub(/</n, '<').gsub(/&(?!(\#\d+|\#x([0-9a-f]+)|\w{2,8});)/nmi, '&')
|
147
|
+
end
|
148
|
+
|
149
|
+
private
|
150
|
+
|
151
|
+
# Everything below element, or just return the doc if element not present.
|
152
|
+
def subtree(doc, element)
|
153
|
+
doc.at("//#{element}/*") || doc
|
154
|
+
end
|
155
|
+
|
156
|
+
def remove_tags!(doc, tags)
|
157
|
+
(doc/tags.join(",")).remove unless tags.empty?
|
158
|
+
end
|
159
|
+
|
160
|
+
end
|
161
|
+
end
|
162
|
+
end
|
163
|
+
|
164
|
+
|
165
|
+
module Enumerable
|
166
|
+
def build_hash
|
167
|
+
result = {}
|
168
|
+
self.each do |elt|
|
169
|
+
key, value = yield elt
|
170
|
+
result[key] = value
|
171
|
+
end
|
172
|
+
result
|
173
|
+
end
|
174
|
+
end
|
175
|
+
|
176
|
+
# http://blade.nagaokaut.ac.jp/cgi-bin/scat.rb/ruby/ruby-talk/207625
|
177
|
+
# Subject: A simple Hpricot text setter
|
178
|
+
# From: Chris Gehlker <canyonrat mac.com>
|
179
|
+
# Date: Fri, 11 Aug 2006 03:19:13 +0900
|
180
|
+
class Hpricot::Text
|
181
|
+
def set(string)
|
182
|
+
@content = string
|
183
|
+
self.raw_string = string
|
184
|
+
end
|
185
|
+
end
|
186
|
+
|
data/lib/parsers/rss.rb
CHANGED
@@ -52,6 +52,7 @@ module FeedNormalizer
|
|
52
52
|
:date_published => :pubDate,
|
53
53
|
:urls => :link,
|
54
54
|
:description => :description,
|
55
|
+
:content => :description,
|
55
56
|
:title => :title,
|
56
57
|
:authors => :author
|
57
58
|
}
|
@@ -62,7 +63,6 @@ module FeedNormalizer
|
|
62
63
|
|
63
64
|
# custom item elements
|
64
65
|
feed_entry.id = rss_item.guid.content if rss_item.respond_to?(:guid) && rss_item.guid
|
65
|
-
feed_entry.content.body = rss_item.description
|
66
66
|
feed_entry.copyright = rss.copyright if rss_item.respond_to? :copyright
|
67
67
|
|
68
68
|
feed.entries << feed_entry
|
data/lib/parsers/simple-rss.rb
CHANGED
@@ -53,6 +53,7 @@ module FeedNormalizer
|
|
53
53
|
:date_published => [:pubDate, :published],
|
54
54
|
:urls => :link,
|
55
55
|
:description => [:description, :summary],
|
56
|
+
:content => [:content, :description],
|
56
57
|
:title => :title,
|
57
58
|
:authors => [:author, :contributor]
|
58
59
|
}
|
@@ -64,7 +65,6 @@ module FeedNormalizer
|
|
64
65
|
# custom entry elements
|
65
66
|
feed_entry.id = atomrss_entry.guid || atomrss_entry[:id] # entries are a Hash..
|
66
67
|
feed_entry.copyright = atomrss_entry.copyright || (atomrss.respond_to?(:copyright) ? atomrss.copyright : nil)
|
67
|
-
feed_entry.content.body = atomrss_entry.content || atomrss_entry.description
|
68
68
|
|
69
69
|
feed.entries << feed_entry
|
70
70
|
end
|
@@ -74,7 +74,7 @@ module FeedNormalizer
|
|
74
74
|
|
75
75
|
def self.image(parser)
|
76
76
|
if parser.respond_to?(:image) && parser.image
|
77
|
-
if parser.image
|
77
|
+
if parser.image =~ /<url>/ # RSS image contains an <url> spec
|
78
78
|
parser.image.scan(/<url>(.*)<\/url>/).to_s
|
79
79
|
else
|
80
80
|
parser.image # Atom contains just the url
|
@@ -90,9 +90,7 @@ module FeedNormalizer
|
|
90
90
|
|
91
91
|
# gets the value returned from the method if it is overridden, otherwise nil.
|
92
92
|
def self.overridden_value(object, method)
|
93
|
-
|
94
|
-
# Highly dependent upon Method's to_s :(
|
95
|
-
object.id if object.method(:id).to_s.match /SimpleRSS\#/
|
93
|
+
object.class.public_instance_methods(false).include? method
|
96
94
|
end
|
97
95
|
|
98
96
|
end
|
data/lib/structures.rb
CHANGED
@@ -10,14 +10,12 @@ module FeedNormalizer
|
|
10
10
|
# Example:
|
11
11
|
# Object contains an array called 'alphas', which looks like [:a, :b, :c].
|
12
12
|
# Call object.alpha and :a is returned.
|
13
|
-
def method_missing(name)
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
end
|
20
|
-
nil
|
13
|
+
def method_missing(name, *args)
|
14
|
+
return self.send(:"#{name}s").first rescue super(name, *args)
|
15
|
+
end
|
16
|
+
|
17
|
+
def respond_to?(x, y=false)
|
18
|
+
self.class::ELEMENTS.include?(x) || self.class::ELEMENTS.include?(:"#{x}s") || super(x, y)
|
21
19
|
end
|
22
20
|
|
23
21
|
end
|
@@ -34,55 +32,126 @@ module FeedNormalizer
|
|
34
32
|
self.class::ELEMENTS.collect{|el| self.instance_variable_get("@#{el}")==other.instance_variable_get("@#{el}")}.all?)
|
35
33
|
end
|
36
34
|
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
35
|
+
# Returns the difference between two Feed instances as a hash.
|
36
|
+
# Any top-level differences in the Feed object are presented as:
|
37
|
+
#
|
38
|
+
# { :title => [content, other_content] }
|
39
|
+
#
|
40
|
+
# For differences at the items level, an array of hashes shows the diffs
|
41
|
+
# on a per-entry basis. Only entries that differ will contain a hash:
|
42
|
+
#
|
43
|
+
# { :items => [
|
44
|
+
{:title => ["An article title", "A new article title"]},
|
45
|
+
# {:title => ["one title", "a different title"]} ]}
|
46
|
+
#
|
47
|
+
# If the number of items in each feed is different, then the count of each
|
48
|
+
# is provided instead:
|
49
|
+
#
|
50
|
+
# { :items => [4,5] }
|
51
|
+
#
|
52
|
+
# This method can also be useful for human-readable feed comparison if
|
53
|
+
# its output is dumped to YAML.
|
54
|
+
def diff(other, elements = self.class::ELEMENTS)
|
55
|
+
diffs = {}
|
56
|
+
|
57
|
+
elements.each do |element|
|
58
|
+
if other.respond_to?(element)
|
59
|
+
self_value = self.send(element)
|
60
|
+
other_value = other.send(element)
|
61
|
+
|
62
|
+
next if self_value == other_value
|
63
|
+
|
64
|
+
diffs[element] = if other_value.respond_to?(:diff)
|
65
|
+
self_value.diff(other_value)
|
66
|
+
|
67
|
+
elsif other_value.is_a?(Enumerable) && other_value.all?{|v| v.respond_to?(:diff)}
|
68
|
+
|
69
|
+
if self_value.size != other_value.size
|
70
|
+
[self_value.size, other_value.size]
|
71
|
+
else
|
72
|
+
enum_diffs = []
|
73
|
+
self_value.each_with_index do |val, index|
|
74
|
+
enum_diffs << val.diff(other_value[index], val.class::ELEMENTS)
|
75
|
+
end
|
76
|
+
enum_diffs.reject{|h| h.empty?}
|
77
|
+
end
|
78
|
+
|
79
|
+
else
|
80
|
+
[other_value, self_value] unless other_value == self_value
|
81
|
+
end
|
82
|
+
end
|
83
|
+
end
|
43
84
|
|
44
|
-
|
45
|
-
@type = :text
|
85
|
+
diffs
|
46
86
|
end
|
47
87
|
|
48
|
-
|
49
|
-
body
|
50
|
-
end
|
88
|
+
end
|
51
89
|
|
52
|
-
|
53
|
-
|
54
|
-
|
90
|
+
module ElementCleaner
|
91
|
+
# Recursively cleans all elements in place.
|
92
|
+
#
|
93
|
+
# Only allow tags in whitelist. Always parse the html with a parser and delete
|
94
|
+
# all tags that aren't on the list.
|
95
|
+
#
|
96
|
+
# For feed elements that can contain HTML:
|
97
|
+
# - feed.(title|description)
|
98
|
+
# - feed.entries[n].(title|description|content)
|
99
|
+
#
|
100
|
+
def clean!
|
101
|
+
self.class::SIMPLE_ELEMENTS.each do |element|
|
102
|
+
val = self.send(element)
|
55
103
|
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
104
|
+
send("#{element}=", (val.is_a?(Array) ?
|
105
|
+
val.collect{|v| HtmlCleaner.flatten(v.to_s)} : HtmlCleaner.flatten(val.to_s)))
|
106
|
+
end
|
107
|
+
|
108
|
+
self.class::HTML_ELEMENTS.each do |element|
|
109
|
+
send("#{element}=", HtmlCleaner.clean(self.send(element).to_s))
|
110
|
+
end
|
111
|
+
|
112
|
+
self.class::BLENDED_ELEMENTS.each do |element|
|
113
|
+
self.send(element).collect{|v| v.clean!}
|
114
|
+
end
|
61
115
|
end
|
62
116
|
end
|
63
117
|
|
118
|
+
|
64
119
|
# Represents a feed item entry.
|
65
120
|
class Entry
|
66
|
-
include Singular, ElementEquality
|
121
|
+
include Singular, ElementEquality, ElementCleaner
|
122
|
+
|
123
|
+
HTML_ELEMENTS = [:content, :description, :title]
|
124
|
+
SIMPLE_ELEMENTS = [:date_published, :urls, :id, :authors, :copyright]
|
125
|
+
BLENDED_ELEMENTS = []
|
67
126
|
|
68
|
-
ELEMENTS =
|
69
|
-
|
127
|
+
ELEMENTS = HTML_ELEMENTS + SIMPLE_ELEMENTS + BLENDED_ELEMENTS
|
128
|
+
|
129
|
+
attr_accessor(*ELEMENTS)
|
70
130
|
|
71
131
|
def initialize
|
72
132
|
@urls = []
|
73
133
|
@authors = []
|
74
|
-
@content = Content.new
|
75
134
|
end
|
76
135
|
|
77
136
|
end
|
78
137
|
|
79
138
|
# Represents the root element of a feed.
|
80
139
|
class Feed
|
81
|
-
include Singular, ElementEquality
|
140
|
+
include Singular, ElementEquality, ElementCleaner
|
141
|
+
|
142
|
+
# Elements that can contain HTML fragments.
|
143
|
+
HTML_ELEMENTS = [:title, :description]
|
82
144
|
|
83
|
-
|
84
|
-
|
85
|
-
|
145
|
+
# Elements that contain 'plain' Strings, with HTML escaped.
|
146
|
+
SIMPLE_ELEMENTS = [:id, :last_updated, :copyright, :authors, :urls, :image, :generator]
|
147
|
+
|
148
|
+
# Elements that contain both HTML and escaped HTML.
|
149
|
+
BLENDED_ELEMENTS = [:items]
|
150
|
+
|
151
|
+
ELEMENTS = HTML_ELEMENTS + SIMPLE_ELEMENTS + BLENDED_ELEMENTS
|
152
|
+
|
153
|
+
attr_accessor(*ELEMENTS)
|
154
|
+
attr_accessor(:parser)
|
86
155
|
|
87
156
|
alias :entries :items
|
88
157
|
|
@@ -95,6 +164,7 @@ module FeedNormalizer
|
|
95
164
|
end
|
96
165
|
|
97
166
|
def channel() self end
|
167
|
+
|
98
168
|
end
|
99
169
|
|
100
170
|
end
|
@@ -0,0 +1,49 @@
|
|
1
|
+
<?xml version="1.0" encoding="ISO-8859-1" ?>
|
2
|
+
<?xml-stylesheet title="XSL_formatting" type="text/xsl" href="/shared/bsp/xsl/rss/nolsol.xsl"?>
|
3
|
+
<rss version="2.0">
|
4
|
+
<channel>
|
5
|
+
<title>diff</title>
|
6
|
+
<link>http://news.bbc.co.uk/go/rss/-/1/hi/technology/default.stm</link>
|
7
|
+
<description>Visit BBC News for up-to-the-minute news, breaking news, video, audio and feature stories. BBC News provides trusted World and UK news as well as local and regional perspectives. Also entertainment, business, science, technology and health news.</description>
|
8
|
+
<language>en-gb</language>
|
9
|
+
<lastBuildDate>Sat, 09 Sep 2006 14:57:06 GMT</lastBuildDate>
|
10
|
+
<copyright>Copyright: (C) British Broadcasting Corporation, see http://news.bbc.co.uk/1/hi/help/rss/4498287.stm for terms and conditions of reuse</copyright>
|
11
|
+
<docs>http://www.bbc.co.uk/syndication/</docs>
|
12
|
+
<ttl>15</ttl>
|
13
|
+
|
14
|
+
<image>
|
15
|
+
<title>BBC News</title>
|
16
|
+
<url>http://news.bbc.co.uk/nol/shared/img/bbc_news_120x60.gif</url>
|
17
|
+
<link>http://news.bbc.co.uk/go/rss/-/1/hi/technology/default.stm</link>
|
18
|
+
</image>
|
19
|
+
|
20
|
+
<item>
|
21
|
+
<title>diff</title>
|
22
|
+
<description>BBC Click investigates free security software and finds out who will protect PCs when Microsoft launches Vista.</description>
|
23
|
+
<link>http://news.bbc.co.uk/go/rss/-/1/hi/programmes/click_online/5326654.stm</link>
|
24
|
+
<guid isPermaLink="false">http://news.bbc.co.uk/1/hi/programmes/click_online/5326654.stm</guid>
|
25
|
+
<pubDate>Sat, 09 Sep 2006 12:45:35 GMT</pubDate>
|
26
|
+
<category>Click</category>
|
27
|
+
</item>
|
28
|
+
|
29
|
+
<item>
|
30
|
+
<title>diff</title>
|
31
|
+
<description>diff</description>
|
32
|
+
<link>diff</link>
|
33
|
+
<guid isPermaLink="false">http://news.bbc.co.uk/1/hi/technology/5328446.stm</guid>
|
34
|
+
<pubDate>Fri, 08 Sep 2006 16:18:08 GMT</pubDate>
|
35
|
+
<category>diff</category>
|
36
|
+
</item>
|
37
|
+
|
38
|
+
<item>
|
39
|
+
<title>MP3 player court order overturned</title>
|
40
|
+
<description>SanDisk puts its MP3 players back on display at a German electronics show after overturning a court injunction.</description>
|
41
|
+
<link>http://news.bbc.co.uk/go/rss/-/1/hi/technology/5326660.stm</link>
|
42
|
+
<guid isPermaLink="false">http://news.bbc.co.uk/1/hi/technology/5326660.stm</guid>
|
43
|
+
<pubDate>Fri, 08 Sep 2006 10:14:41 GMT</pubDate>
|
44
|
+
<category>Technology</category>
|
45
|
+
</item>
|
46
|
+
|
47
|
+
</channel>
|
48
|
+
</rss>
|
49
|
+
|
@@ -0,0 +1,40 @@
|
|
1
|
+
<?xml version="1.0" encoding="ISO-8859-1" ?>
|
2
|
+
<?xml-stylesheet title="XSL_formatting" type="text/xsl" href="/shared/bsp/xsl/rss/nolsol.xsl"?>
|
3
|
+
<rss version="2.0">
|
4
|
+
<channel>
|
5
|
+
<title>diff</title>
|
6
|
+
<link>http://news.bbc.co.uk/go/rss/-/1/hi/technology/default.stm</link>
|
7
|
+
<description>Visit BBC News for up-to-the-minute news, breaking news, video, audio and feature stories. BBC News provides trusted World and UK news as well as local and regional perspectives. Also entertainment, business, science, technology and health news.</description>
|
8
|
+
<language>en-gb</language>
|
9
|
+
<lastBuildDate>Sat, 09 Sep 2006 14:57:06 GMT</lastBuildDate>
|
10
|
+
<copyright>Copyright: (C) British Broadcasting Corporation, see http://news.bbc.co.uk/1/hi/help/rss/4498287.stm for terms and conditions of reuse</copyright>
|
11
|
+
<docs>http://www.bbc.co.uk/syndication/</docs>
|
12
|
+
<ttl>15</ttl>
|
13
|
+
|
14
|
+
<image>
|
15
|
+
<title>BBC News</title>
|
16
|
+
<url>http://news.bbc.co.uk/nol/shared/img/bbc_news_120x60.gif</url>
|
17
|
+
<link>http://news.bbc.co.uk/go/rss/-/1/hi/technology/default.stm</link>
|
18
|
+
</image>
|
19
|
+
|
20
|
+
<item>
|
21
|
+
<title>diff</title>
|
22
|
+
<description>BBC Click investigates free security software and finds out who will protect PCs when Microsoft launches Vista.</description>
|
23
|
+
<link>http://news.bbc.co.uk/go/rss/-/1/hi/programmes/click_online/5326654.stm</link>
|
24
|
+
<guid isPermaLink="false">http://news.bbc.co.uk/1/hi/programmes/click_online/5326654.stm</guid>
|
25
|
+
<pubDate>Sat, 09 Sep 2006 12:45:35 GMT</pubDate>
|
26
|
+
<category>Click</category>
|
27
|
+
</item>
|
28
|
+
|
29
|
+
<item>
|
30
|
+
<title>diff</title>
|
31
|
+
<description>A Japanese scientist who invented a sustainable form of light is awarded the Millennium Technology Prize.</description>
|
32
|
+
<link>diff</link>
|
33
|
+
<guid isPermaLink="false">http://news.bbc.co.uk/1/hi/technology/5328446.stm</guid>
|
34
|
+
<pubDate>Fri, 08 Sep 2006 16:18:08 GMT</pubDate>
|
35
|
+
<category>diff</category>
|
36
|
+
</item>
|
37
|
+
|
38
|
+
</channel>
|
39
|
+
</rss>
|
40
|
+
|
data/test/test_all.rb
ADDED
@@ -0,0 +1,137 @@
|
|
1
|
+
require 'test/unit'
|
2
|
+
require 'feed-normalizer'
|
3
|
+
|
4
|
+
include FeedNormalizer
|
5
|
+
|
6
|
+
class FeedNormalizerTest < Test::Unit::TestCase
|
7
|
+
|
8
|
+
XML_FILES = {}
|
9
|
+
|
10
|
+
data_dir = File.dirname(__FILE__) + '/data'
|
11
|
+
|
12
|
+
# Load up the xml files
|
13
|
+
Dir.open(data_dir).each do |fn|
|
14
|
+
next unless fn =~ /[.]xml$/
|
15
|
+
XML_FILES[fn.scan(/(.*)[.]/).to_s.to_sym] = File.read(data_dir + "/#{fn}")
|
16
|
+
end
|
17
|
+
|
18
|
+
def test_basic_parse
|
19
|
+
assert_kind_of Feed, FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rss20])
|
20
|
+
end
|
21
|
+
|
22
|
+
def test_force_parser
|
23
|
+
assert_kind_of Feed, FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rss20],
|
24
|
+
:force_parser => RubyRssParser, :try_others => true)
|
25
|
+
end
|
26
|
+
|
27
|
+
def test_force_parser_exclusive
|
28
|
+
assert_kind_of Feed, FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rss20],
|
29
|
+
:force_parser => RubyRssParser, :try_others => false)
|
30
|
+
end
|
31
|
+
|
32
|
+
def test_ruby_rss_parser
|
33
|
+
assert_kind_of Feed, FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rss20],
|
34
|
+
:force_parser => RubyRssParser, :try_others => false)
|
35
|
+
assert_kind_of Feed, FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rdf10],
|
36
|
+
:force_parser => RubyRssParser, :try_others => false)
|
37
|
+
end
|
38
|
+
|
39
|
+
def test_simple_rss_parser
|
40
|
+
assert_kind_of Feed, FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rss20],
|
41
|
+
:force_parser => SimpleRssParser, :try_others => false)
|
42
|
+
assert_kind_of Feed, FeedNormalizer::FeedNormalizer.parse(XML_FILES[:atom10],
|
43
|
+
:force_parser => SimpleRssParser, :try_others => false)
|
44
|
+
end
|
45
|
+
|
46
|
+
def test_parser_failover_order
|
47
|
+
assert_equal SimpleRSS, FeedNormalizer::FeedNormalizer.parse(XML_FILES[:atom10], :force_parser => RubyRssParser).parser
|
48
|
+
end
|
49
|
+
|
50
|
+
def test_force_parser_fail
|
51
|
+
assert_nil FeedNormalizer::FeedNormalizer.parse(XML_FILES[:atom10], :force_parser => RubyRssParser, :try_others => false)
|
52
|
+
end
|
53
|
+
|
54
|
+
def test_all_parsers_fail
|
55
|
+
assert_nil FeedNormalizer::FeedNormalizer.parse("This isn't RSS or Atom!")
|
56
|
+
end
|
57
|
+
|
58
|
+
def test_correct_parser_used
|
59
|
+
assert_equal RSS::Parser, FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rss20]).parser
|
60
|
+
assert_equal SimpleRSS, FeedNormalizer::FeedNormalizer.parse(XML_FILES[:atom10]).parser
|
61
|
+
end
|
62
|
+
|
63
|
+
def test_rss
|
64
|
+
feed = FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rss20])
|
65
|
+
|
66
|
+
assert_equal "BBC News | Technology | UK Edition", feed.title
|
67
|
+
assert_equal ["http://news.bbc.co.uk/go/rss/-/1/hi/technology/default.stm"], feed.urls
|
68
|
+
assert_equal "MP3 player court order overturned", feed.entries.last.title
|
69
|
+
assert_equal "SanDisk puts its MP3 players back on display at a German electronics show after overturning a court injunction.", feed.entries.last.description
|
70
|
+
assert_equal "SanDisk puts its MP3 players back on display at a German electronics show after overturning a court injunction.", feed.entries.last.content
|
71
|
+
end
|
72
|
+
|
73
|
+
def test_simplerss
|
74
|
+
feed = FeedNormalizer::FeedNormalizer.parse(XML_FILES[:atom10])
|
75
|
+
|
76
|
+
assert_equal "~:caboose", feed.title
|
77
|
+
assert_equal "http://habtm.com/xml/atom10/feed.xml", feed.url
|
78
|
+
assert_equal "Starfish - Easy Distribution of Site Maintenance", feed.entries.last.title
|
79
|
+
assert_equal "urn:uuid:6c028f36-f87a-4f53-b7e3-1f943d2341f0", feed.entries.last.id
|
80
|
+
|
81
|
+
assert !feed.entries.last.description.include?("google fame")
|
82
|
+
assert feed.entries.last.content.include?("google fame")
|
83
|
+
end
|
84
|
+
|
85
|
+
def test_sanity_check
|
86
|
+
XML_FILES.keys.each do |xml_file|
|
87
|
+
feed = FeedNormalizer::FeedNormalizer.parse(XML_FILES[xml_file])
|
88
|
+
|
89
|
+
assert [feed.title, feed.url, feed.entries.first.url].collect{|e| e.is_a?(String)}.all?, "Not everything was a String in #{xml_file}"
|
90
|
+
assert [feed.parser, feed.class].collect{|e| e.is_a?(Class)}.all?
|
91
|
+
end
|
92
|
+
end
|
93
|
+
|
94
|
+
def test_feed_equality
|
95
|
+
assert_equal FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rss20]), FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rss20])
|
96
|
+
assert_equal FeedNormalizer::FeedNormalizer.parse(XML_FILES[:atom10]), FeedNormalizer::FeedNormalizer.parse(XML_FILES[:atom10])
|
97
|
+
assert_not_equal FeedNormalizer::FeedNormalizer.parse(XML_FILES[:atom03]), FeedNormalizer::FeedNormalizer.parse(XML_FILES[:atom10])
|
98
|
+
assert_not_equal FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rss20]), FeedNormalizer::FeedNormalizer.parse(XML_FILES[:atom10])
|
99
|
+
assert_not_equal FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rss20]), FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rss20diff])
|
100
|
+
end
|
101
|
+
|
102
|
+
def test_feed_diff
|
103
|
+
feed = FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rss20])
|
104
|
+
|
105
|
+
diff = feed.diff(FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rss20diff]))
|
106
|
+
diff_short = feed.diff(FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rss20diff_short]))
|
107
|
+
no_diff = feed.diff(feed)
|
108
|
+
|
109
|
+
assert diff.keys.all? {|key| [:title, :items].include?(key)}
|
110
|
+
assert_equal 2, diff[:items].size
|
111
|
+
|
112
|
+
assert diff_short.keys.all? {|key| [:title, :items].include?(key)}
|
113
|
+
assert_equal [3,2], diff_short[:items]
|
114
|
+
|
115
|
+
assert no_diff.empty?
|
116
|
+
end
|
117
|
+
|
118
|
+
def test_marshal
|
119
|
+
feed = FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rss20])
|
120
|
+
|
121
|
+
assert_nothing_raised { Marshal.load(Marshal.dump(feed)) }
|
122
|
+
end
|
123
|
+
|
124
|
+
def test_method_missing
|
125
|
+
assert_raise(NoMethodError) { Feed.new(nil).nonexistant }
|
126
|
+
end
|
127
|
+
|
128
|
+
def test_clean
|
129
|
+
feed = FeedNormalizer::FeedNormalizer.parse(XML_FILES[:atom10])
|
130
|
+
|
131
|
+
assert feed.entries.first.content !~ /\<p\>/
|
132
|
+
feed.clean!
|
133
|
+
assert feed.entries.first.content =~ /\<p\>/
|
134
|
+
end
|
135
|
+
|
136
|
+
end
|
137
|
+
|
@@ -0,0 +1,151 @@
|
|
1
|
+
require 'test/unit'
|
2
|
+
require 'html-cleaner'
|
3
|
+
|
4
|
+
include FeedNormalizer
|
5
|
+
|
6
|
+
class HtmlCleanerTest < Test::Unit::TestCase
|
7
|
+
|
8
|
+
def test_unescape
|
9
|
+
assert_equal "' ' °", FeedNormalizer::HtmlCleaner.unescapeHTML("' ' °")
|
10
|
+
assert_equal "\" °", FeedNormalizer::HtmlCleaner.unescapeHTML("" °")
|
11
|
+
assert_equal "\"\"\"\"", FeedNormalizer::HtmlCleaner.unescapeHTML("""""")
|
12
|
+
assert_equal "heavily subnet’d network,", FeedNormalizer::HtmlCleaner.unescapeHTML("heavily subnet’d network,")
|
13
|
+
end
|
14
|
+
|
15
|
+
def test_add_entities
|
16
|
+
assert_equal "x > y", HtmlCleaner.add_entities("x > y")
|
17
|
+
assert_equal "1 & 2", HtmlCleaner.add_entities("1 & 2")
|
18
|
+
assert_equal "& { ´ ģ", HtmlCleaner.add_entities("& { ´ ģ")
|
19
|
+
assert_equal "& { &ACUTE; ሺ ࠏ", HtmlCleaner.add_entities("& { &ACUTE; ሺ ࠏ")
|
20
|
+
assert_equal "heavily subnet’d network,", HtmlCleaner.add_entities("heavily subnet’d network,")
|
21
|
+
end
|
22
|
+
|
23
|
+
def test_html_clean
|
24
|
+
assert_equal "", HtmlCleaner.clean("")
|
25
|
+
|
26
|
+
assert_equal "<p>foo > *</p>", HtmlCleaner.clean("<p>foo > *</p>")
|
27
|
+
assert_equal "<p>foo > *</p>", HtmlCleaner.clean("<p>foo > *</p>")
|
28
|
+
|
29
|
+
assert_equal "<p>para</p>", HtmlCleaner.clean("<p foo=bar>para</p>")
|
30
|
+
assert_equal "<p>para</p> outsider", HtmlCleaner.clean("<p foo=bar>para</p> outsider")
|
31
|
+
|
32
|
+
assert_equal "<p>para</p>", HtmlCleaner.clean("<p>para</p></notvalid>")
|
33
|
+
assert_equal "<p>para</p>", HtmlCleaner.clean("<p>para</p></body>")
|
34
|
+
|
35
|
+
assert_equal "<p>para</p>", HtmlCleaner.clean("<p>para</p><plaintext>")
|
36
|
+
assert_equal "<p>para</p>", HtmlCleaner.clean("<p>para</p><object><param></param></object>")
|
37
|
+
assert_equal "<p>para</p>", HtmlCleaner.clean("<p>para</p><iframe src='http://evil.example.org'></iframe>")
|
38
|
+
assert_equal "<p>para</p>", HtmlCleaner.clean("<p>para</p><iframe src='http://evil.example.org'>")
|
39
|
+
|
40
|
+
assert_equal "<p>para</p>", HtmlCleaner.clean("<p>para</p><invalid>invalid</invalid>")
|
41
|
+
|
42
|
+
assert_equal "<a href=\"http://example.org\">para</a>", HtmlCleaner.clean("<a href='http://example.org'>para</a>")
|
43
|
+
assert_equal "<a href=\"http://example.org/proc?a&b\">para</a>", HtmlCleaner.clean("<a href='http://example.org/proc?a&b'>para</a>")
|
44
|
+
|
45
|
+
assert_equal "<p>two</p>", HtmlCleaner.clean("<p>para</p><body><p>two</p></body>")
|
46
|
+
assert_equal "<p>two</p>", HtmlCleaner.clean("<p>para</p><body><p>two</p>")
|
47
|
+
assert_equal "<p>para</p><bo /dy><p>two</p>", HtmlCleaner.clean("<p>para</p><bo /dy><p>two</p></body>")
|
48
|
+
assert_equal "<p>para</p><bo\\/dy><p>two</p>", HtmlCleaner.clean("<p>para</p><bo\\/dy><p>two</p></body>")
|
49
|
+
assert_equal "<p>para</p><p>two</p>", HtmlCleaner.clean("<p>para</p><body/><p>two</p></body>")
|
50
|
+
|
51
|
+
assert_equal "<p>one & two</p>", HtmlCleaner.clean(HtmlCleaner.clean("<p>one & two</p>"))
|
52
|
+
|
53
|
+
assert_equal "<p id=\"p\">para</p>", HtmlCleaner.clean("<p id=\"p\" ignore=\"this\">para</p>")
|
54
|
+
assert_equal "<p id=\"p\">para</p>", HtmlCleaner.clean("<p id=\"p\" onclick=\"this\">para</p>")
|
55
|
+
|
56
|
+
assert_equal "<img src=\"http://example.org/pic\" />", HtmlCleaner.clean("<img src=\"http://example.org/pic\" />")
|
57
|
+
assert_equal "<img />", HtmlCleaner.clean("<img src=\"jav a script:call()\" />")
|
58
|
+
|
59
|
+
assert_equal "what's new", HtmlCleaner.clean("what's new")
|
60
|
+
assert_equal ""what's new?"", HtmlCleaner.clean("\"what's new?\"")
|
61
|
+
assert_equal ""what's new?"", HtmlCleaner.clean(""what's new?"")
|
62
|
+
|
63
|
+
# Real-world examples from selected feeds
|
64
|
+
assert_equal "I have a heavily subnet’d/vlan’d network,", HtmlCleaner.clean("I have a heavily subnet’d/vlan’d network,")
|
65
|
+
|
66
|
+
assert_equal "<pre><blockquote><%= start_form_tag :action => "create" %></blockquote></pre>",
|
67
|
+
HtmlCleaner.clean("<pre><blockquote><%= start_form_tag :action => \"create\" %></blockquote></pre>")
|
68
|
+
|
69
|
+
assert_equal "<a href=\"http://www.mcall.com/news/local/all-smashedmachine1107-cn,0,1574203.story?coll=all-news-hed\">[link]</a><a href=\"http://reddit.com/info/pyhc/comments\">[more]</a>",
|
70
|
+
HtmlCleaner.clean("<a href=\"http://www.mcall.com/news/local/all-smashedmachine1107-cn,0,1574203.story?coll=all-news-hed\">[link]</a><a href=\"http://reddit.com/info/pyhc/comments\">[more]</a>")
|
71
|
+
|
72
|
+
|
73
|
+
# Various exploits from the past
|
74
|
+
assert_equal "", HtmlCleaner.clean("<_img foo=\"<IFRAME width='80%' height='400' src='http://alive.znep.com/~marcs/passport/grabit.html'></IFRAME>\" >")
|
75
|
+
assert_equal "<a href=\"https://bugzilla.mozilla.org/attachment.cgi?id=&action=force_internal_error<script>alert(document.cookie)</script>\">link</a>",
|
76
|
+
HtmlCleaner.clean("<a href=\"https://bugzilla.mozilla.org/attachment.cgi?id=&action=force_internal_error<script>alert(document.cookie)</script>\">link</a>")
|
77
|
+
assert_equal "<img src=\"doesntexist.jpg\" />", HtmlCleaner.clean("<img src='doesntexist.jpg' onerror='alert(document.cookie)'/>")
|
78
|
+
assert_equal "<img src=\"'doesntexist.jpg\" />", HtmlCleaner.clean("<img src=\"'doesntexist.jpg\" onmouseover=\"alert('img-ob-11');''\"/>")
|
79
|
+
assert_equal "<IMG """>">", HtmlCleaner.clean("<IMG \"\"\"><SCRIPT>alert(\"XSS\")</SCRIPT>\">")
|
80
|
+
|
81
|
+
# This doesn't come out as I would like, but the result is still safe.
|
82
|
+
# (Apparently, this would work in Gecko.)
|
83
|
+
assert HtmlCleaner.clean("<p onclick!\#$%&()*~+-_.,:;?@[/|\\]^=alert(\"XSS\")>para</p>") !~ /\<\>/
|
84
|
+
assert_equal "<SCRIPT/XSS SRC="http://ha.ckers.org/xss.js">", HtmlCleaner.clean("<SCRIPT/XSS SRC=\"http://ha.ckers.org/xss.js\"></SCRIPT>")
|
85
|
+
|
86
|
+
assert_equal "", HtmlCleaner.clean("<!--[if gte IE 4]><SCRIPT>alert('XSS');</SCRIPT><![endif]-->")
|
87
|
+
assert_equal "<p></p>", HtmlCleaner.clean("<p><!--[if gte IE 4]><SCRIPT>alert('XSS');</SCRIPT><![endif]--></p>")
|
88
|
+
assert_equal "<p>hi</p><p></p>", HtmlCleaner.clean("<p>hi</p><p><!--[if gte IE 4]><SCRIPT>alert('XSS');</SCRIPT><![endif]--></p>")
|
89
|
+
end
|
90
|
+
|
91
|
+
def test_html_flatten
|
92
|
+
assert_equal "", HtmlCleaner.flatten("")
|
93
|
+
|
94
|
+
assert_equal "hello", HtmlCleaner.flatten("hello")
|
95
|
+
assert_equal "hello world", HtmlCleaner.flatten("hello\nworld")
|
96
|
+
|
97
|
+
assert_equal "A > B : C", HtmlCleaner.flatten("A > B : C")
|
98
|
+
assert_equal "what's new", HtmlCleaner.flatten("what's new")
|
99
|
+
assert_equal ""what's new?"", HtmlCleaner.flatten("\"what's new?\"")
|
100
|
+
|
101
|
+
assert_equal "we’ve got <a hre", HtmlCleaner.flatten("we’ve got <a hre")
|
102
|
+
|
103
|
+
assert_equal "http://example.org", HtmlCleaner.flatten("http://example.org")
|
104
|
+
assert_equal "http://example.org/proc?a&b", HtmlCleaner.flatten("http://example.org/proc?a&b")
|
105
|
+
|
106
|
+
assert_equal ""what's new?"", HtmlCleaner.flatten(HtmlCleaner.flatten("\"what's new?\""))
|
107
|
+
end
|
108
|
+
|
109
|
+
def test_dodgy_uri
|
110
|
+
# All of these javascript urls work in IE6.
|
111
|
+
assert HtmlCleaner.dodgy_uri?("javascript:alert('HI');")
|
112
|
+
assert HtmlCleaner.dodgy_uri?(" javascript \n :alert('HI');")
|
113
|
+
assert HtmlCleaner.dodgy_uri?("JaVaScRiPt:alert('HI');")
|
114
|
+
assert HtmlCleaner.dodgy_uri?("JaV \naSc\nRiPt:alert('HI');")
|
115
|
+
|
116
|
+
# entities lacking ending ';'
|
117
|
+
# This only works if they're all packed together without spacing.
|
118
|
+
assert HtmlCleaner.dodgy_uri?("javascript:alert('img-ob-2')")
|
119
|
+
assert HtmlCleaner.dodgy_uri?("javascript:alert('img-ob-2' ) ; ")
|
120
|
+
# Catch extra spacing anyway. Support for this is possible, depending on where the spaces are.
|
121
|
+
assert HtmlCleaner.dodgy_uri?("j a v a s c r i p t : a l e r t ( ' i m g - o b - 2 ' ) ; ")
|
122
|
+
assert HtmlCleaner.dodgy_uri?("j a v a s c r i p t : a l e r t ( ' i m g - o b - 2 ' ) ; ")
|
123
|
+
assert HtmlCleaner.dodgy_uri?("javascript")
|
124
|
+
assert HtmlCleaner.dodgy_uri?("javascript")
|
125
|
+
|
126
|
+
# url-encoded
|
127
|
+
assert HtmlCleaner.dodgy_uri?("%6A%61%76%61%73%63%72%69%70%74%3A%61%6C%65%72%74%28%27%69%6D%67%2D%6F%62%2D%33%27%29")
|
128
|
+
|
129
|
+
# Other evil schemes
|
130
|
+
assert HtmlCleaner.dodgy_uri?("vbscript:MsgBox(\"hi\")")
|
131
|
+
assert HtmlCleaner.dodgy_uri?("mocha:alert('hi')")
|
132
|
+
assert HtmlCleaner.dodgy_uri?("livescript:alert('hi')")
|
133
|
+
assert HtmlCleaner.dodgy_uri?("data:text/html;base64,PHNjcmlwdD5hbGVydCgnWFNTJyk8L3NjcmlwdD4K")
|
134
|
+
|
135
|
+
# Various non-printing chars
|
136
|
+
assert HtmlCleaner.dodgy_uri?("javas\0cript:foo()")
|
137
|
+
assert HtmlCleaner.dodgy_uri?("  javascript:foo()")
|
138
|
+
assert HtmlCleaner.dodgy_uri?("jav
ascript:foo()")
|
139
|
+
assert HtmlCleaner.dodgy_uri?("jav	ascript:foo()")
|
140
|
+
assert HtmlCleaner.dodgy_uri?("jav\tascript:foo()")
|
141
|
+
|
142
|
+
# The Good
|
143
|
+
assert_nil HtmlCleaner.dodgy_uri?("http://example.org")
|
144
|
+
assert_nil HtmlCleaner.dodgy_uri?("http://example.org/foo.html")
|
145
|
+
assert_nil HtmlCleaner.dodgy_uri?("http://example.org/foo.cgi?x=y&a=b")
|
146
|
+
assert_nil HtmlCleaner.dodgy_uri?("http://example.org/foo.cgi?x=y&a=b")
|
147
|
+
assert_nil HtmlCleaner.dodgy_uri?("http://example.org/foo.cgi?x=y&a=b")
|
148
|
+
assert_nil HtmlCleaner.dodgy_uri?("http://example.org/foo.cgi?x=yVa=b")
|
149
|
+
end
|
150
|
+
|
151
|
+
end
|
metadata
CHANGED
@@ -3,16 +3,16 @@ rubygems_version: 0.8.11
|
|
3
3
|
specification_version: 1
|
4
4
|
name: feed-normalizer
|
5
5
|
version: !ruby/object:Gem::Version
|
6
|
-
version: 1.1.0
|
7
|
-
date: 2006-
|
6
|
+
version: 1.2.0
|
7
|
+
date: 2006-11-29 00:00:00 -08:00
|
8
8
|
summary: Extensible Ruby wrapper for Atom and RSS parsers
|
9
9
|
require_paths:
|
10
10
|
- lib
|
11
11
|
email: andy@tinnedfruit.org
|
12
|
-
homepage: http://
|
13
|
-
rubyforge_project:
|
14
|
-
description:
|
15
|
-
autorequire:
|
12
|
+
homepage: http://feed-normalizer.rubyforge.org/
|
13
|
+
rubyforge_project: feed-normalizer
|
14
|
+
description: An extensible Ruby wrapper for Atom and RSS parsers. Feed normalizer wraps various RSS and Atom parsers, and returns a single unified object graph, regardless of the underlying feed format.
|
15
|
+
autorequire:
|
16
16
|
default_executable:
|
17
17
|
bindir: bin
|
18
18
|
has_rdoc: true
|
@@ -28,23 +28,27 @@ cert_chain:
|
|
28
28
|
authors:
|
29
29
|
- Andrew A. Smith
|
30
30
|
files:
|
31
|
+
- History.txt
|
32
|
+
- License.txt
|
33
|
+
- Manifest.txt
|
34
|
+
- Rakefile
|
35
|
+
- Readme.txt
|
31
36
|
- lib/feed-normalizer.rb
|
32
|
-
- lib/
|
33
|
-
- lib/structures.rb
|
37
|
+
- lib/html-cleaner.rb
|
34
38
|
- lib/parsers/rss.rb
|
35
39
|
- lib/parsers/simple-rss.rb
|
36
|
-
-
|
37
|
-
- test/data
|
40
|
+
- lib/structures.rb
|
38
41
|
- test/data/atom03.xml
|
39
42
|
- test/data/atom10.xml
|
40
43
|
- test/data/rdf10.xml
|
41
44
|
- test/data/rss20.xml
|
42
|
-
-
|
43
|
-
-
|
44
|
-
-
|
45
|
-
-
|
46
|
-
|
47
|
-
|
45
|
+
- test/data/rss20diff.xml
|
46
|
+
- test/data/rss20diff_short.xml
|
47
|
+
- test/test_all.rb
|
48
|
+
- test/test_feednormalizer.rb
|
49
|
+
- test/test_htmlcleaner.rb
|
50
|
+
test_files:
|
51
|
+
- test/test_all.rb
|
48
52
|
rdoc_options: []
|
49
53
|
|
50
54
|
extra_rdoc_files: []
|
@@ -56,6 +60,15 @@ extensions: []
|
|
56
60
|
requirements: []
|
57
61
|
|
58
62
|
dependencies:
|
63
|
+
- !ruby/object:Gem::Dependency
|
64
|
+
name: hoe
|
65
|
+
version_requirement:
|
66
|
+
version_requirements: !ruby/object:Gem::Version::Requirement
|
67
|
+
requirements:
|
68
|
+
- - ">="
|
69
|
+
- !ruby/object:Gem::Version
|
70
|
+
version: 1.1.6
|
71
|
+
version:
|
59
72
|
- !ruby/object:Gem::Dependency
|
60
73
|
name: simple-rss
|
61
74
|
version_requirement:
|
@@ -65,3 +78,12 @@ dependencies:
|
|
65
78
|
- !ruby/object:Gem::Version
|
66
79
|
version: "1.1"
|
67
80
|
version:
|
81
|
+
- !ruby/object:Gem::Dependency
|
82
|
+
name: hpricot
|
83
|
+
version_requirement:
|
84
|
+
version_requirements: !ruby/object:Gem::Version::Requirement
|
85
|
+
requirements:
|
86
|
+
- - ">="
|
87
|
+
- !ruby/object:Gem::Version
|
88
|
+
version: "0.4"
|
89
|
+
version:
|
data/RELEASE
DELETED
@@ -1,13 +0,0 @@
|
|
1
|
-
Steps to create a release:
|
2
|
-
|
3
|
-
o Update Rakefile version
|
4
|
-
o Create package
|
5
|
-
rake clobber
|
6
|
-
rake package
|
7
|
-
o Tag release
|
8
|
-
svn copy https://feed-normalizer.googlecode.com/svn/trunk \
|
9
|
-
https://feed-normalizer.googlecode.com/svn/tags/RELEASE_[MAJOR]_[MINOR]_[REVISION]
|
10
|
-
o Upload gem and zip to rubyforge
|
11
|
-
o Update RAA
|
12
|
-
o Post rubyforge news
|
13
|
-
|
data/test/base_test.rb
DELETED
@@ -1,82 +0,0 @@
|
|
1
|
-
$:.unshift(File.dirname(__FILE__) + '/../lib')
|
2
|
-
|
3
|
-
require 'test/unit'
|
4
|
-
require 'feed-normalizer'
|
5
|
-
|
6
|
-
include FeedNormalizer
|
7
|
-
|
8
|
-
class BaseTest < Test::Unit::TestCase
|
9
|
-
|
10
|
-
XML_FILES = {}
|
11
|
-
|
12
|
-
def setup
|
13
|
-
data_dir = File.dirname(__FILE__) + '/data'
|
14
|
-
|
15
|
-
# Load up the xml files
|
16
|
-
Dir.open(data_dir).each do |fn|
|
17
|
-
next unless fn =~ /[.]xml$/
|
18
|
-
XML_FILES[fn.scan(/(.*)[.]/).to_s.to_sym] = File.read(data_dir + "/#{fn}")
|
19
|
-
end
|
20
|
-
end
|
21
|
-
|
22
|
-
|
23
|
-
def test_basic_parse
|
24
|
-
assert_kind_of Feed, FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rss20])
|
25
|
-
end
|
26
|
-
|
27
|
-
def test_force_parser
|
28
|
-
assert_kind_of Feed, FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rss20], RubyRssParser, true)
|
29
|
-
end
|
30
|
-
|
31
|
-
def test_force_parser_exclusive
|
32
|
-
assert_kind_of Feed, FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rss20], RubyRssParser, false)
|
33
|
-
end
|
34
|
-
|
35
|
-
def test_ruby_rss_parser
|
36
|
-
assert_kind_of Feed, feed=FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rss20], RubyRssParser, false)
|
37
|
-
assert_kind_of Feed, feed=FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rdf10], RubyRssParser, false)
|
38
|
-
end
|
39
|
-
|
40
|
-
def test_simple_rss_parser
|
41
|
-
assert_kind_of Feed, feed=FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rss20], SimpleRssParser, false)
|
42
|
-
assert_kind_of Feed, feed=FeedNormalizer::FeedNormalizer.parse(XML_FILES[:atom10], SimpleRssParser, false)
|
43
|
-
end
|
44
|
-
|
45
|
-
# Attempts to parse a feed that Ruby's RSS can't handle.
|
46
|
-
# SimpleRSS should provide the parsed feed.
|
47
|
-
def test_parser_failover_order
|
48
|
-
assert_kind_of Feed, FeedNormalizer::FeedNormalizer.parse(XML_FILES[:atom10])
|
49
|
-
end
|
50
|
-
|
51
|
-
def test_all_parsers_fail
|
52
|
-
assert_nil FeedNormalizer::FeedNormalizer.parse("This isn't RSS or Atom!")
|
53
|
-
end
|
54
|
-
|
55
|
-
def test_correct_parser_used
|
56
|
-
assert_equal RSS::Parser, FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rss20]).parser
|
57
|
-
assert_equal SimpleRSS, FeedNormalizer::FeedNormalizer.parse(XML_FILES[:atom10]).parser
|
58
|
-
end
|
59
|
-
|
60
|
-
def test_sanity_check
|
61
|
-
XML_FILES.keys.each do |xml_file|
|
62
|
-
feed = FeedNormalizer::FeedNormalizer.parse(XML_FILES[xml_file])
|
63
|
-
|
64
|
-
assert [feed.title, feed.url, feed.entries.first.url].collect{|e| e.is_a?(String)}.all?, "Not everything was a String in #{xml_file}"
|
65
|
-
assert [feed.parser, feed.class].collect{|e| e.is_a?(Class)}.all?
|
66
|
-
end
|
67
|
-
end
|
68
|
-
|
69
|
-
def test_feed_equality
|
70
|
-
assert_equal FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rss20]), FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rss20])
|
71
|
-
assert_equal FeedNormalizer::FeedNormalizer.parse(XML_FILES[:atom10]), FeedNormalizer::FeedNormalizer.parse(XML_FILES[:atom10])
|
72
|
-
assert_not_equal FeedNormalizer::FeedNormalizer.parse(XML_FILES[:atom03]), FeedNormalizer::FeedNormalizer.parse(XML_FILES[:atom10])
|
73
|
-
assert_not_equal FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rss20]), FeedNormalizer::FeedNormalizer.parse(XML_FILES[:atom10])
|
74
|
-
|
75
|
-
XML_FILES.keys.each do |xml_file|
|
76
|
-
feed = FeedNormalizer::FeedNormalizer.parse(XML_FILES[xml_file])
|
77
|
-
assert_equal feed, Marshal.load(Marshal.dump(feed))
|
78
|
-
end
|
79
|
-
|
80
|
-
end
|
81
|
-
|
82
|
-
end
|