mikowitz-feed-normalizer 1.5.1
Sign up to get free protection for your applications and to get access to all the features.
- data/History.txt +52 -0
- data/License.txt +27 -0
- data/Manifest.txt +20 -0
- data/README.txt +63 -0
- data/Rakefile +25 -0
- data/lib/feed-normalizer.rb +149 -0
- data/lib/html-cleaner.rb +190 -0
- data/lib/parsers/rss.rb +102 -0
- data/lib/parsers/simple-rss.rb +138 -0
- data/lib/structures.rb +244 -0
- data/test/data/atom03.xml +128 -0
- data/test/data/atom10.xml +114 -0
- data/test/data/rdf10.xml +1498 -0
- data/test/data/rss20.xml +64 -0
- data/test/data/rss20diff.xml +59 -0
- data/test/data/rss20diff_short.xml +51 -0
- data/test/test_all.rb +6 -0
- data/test/test_feednormalizer.rb +267 -0
- data/test/test_htmlcleaner.rb +155 -0
- metadata +101 -0
@@ -0,0 +1,155 @@
|
|
1
|
+
require 'test/unit'
|
2
|
+
require 'html-cleaner'
|
3
|
+
|
4
|
+
include FeedNormalizer
|
5
|
+
|
6
|
+
class HtmlCleanerTest < Test::Unit::TestCase
|
7
|
+
|
8
|
+
# Exercises FeedNormalizer::HtmlCleaner.unescapeHTML with inputs involving quote
# characters, a degree sign and a typographic apostrophe, comparing each result
# with a literal expected string.
# NOTE(review): the argument literals shown here look as if their HTML entities
# were already decoded by whatever rendered this page (several inputs are
# identical to their expected output, and some contain bare double quotes inside
# a double-quoted string) -- confirm against the gem's real test file.
def test_unescape
|
9
|
+
assert_equal "' ' °", FeedNormalizer::HtmlCleaner.unescapeHTML("' ' °")
|
10
|
+
assert_equal "\" °", FeedNormalizer::HtmlCleaner.unescapeHTML("" °")
|
11
|
+
assert_equal "\"\"\"\"", FeedNormalizer::HtmlCleaner.unescapeHTML("""""")
|
12
|
+
assert_equal "heavily subnet’d network,", FeedNormalizer::HtmlCleaner.unescapeHTML("heavily subnet’d network,")
|
13
|
+
end
|
14
|
+
|
15
|
+
# Exercises HtmlCleaner.add_entities with nil, comparison/ampersand text,
# entity-like sequences and a typographic apostrophe, comparing each result with
# a literal expected string.
# NOTE(review): as displayed, most expected strings equal their inputs; the
# original entity references were presumably decoded by the page renderer --
# verify against the gem's real test file before drawing conclusions.
def test_add_entities
|
16
|
+
# A nil argument is expected to produce an empty string rather than nil.
assert_equal "", HtmlCleaner.add_entities(nil)
|
17
|
+
assert_equal "x > y", HtmlCleaner.add_entities("x > y")
|
18
|
+
assert_equal "1 & 2", HtmlCleaner.add_entities("1 & 2")
|
19
|
+
assert_equal "& { ´ ģ", HtmlCleaner.add_entities("& { ´ ģ")
|
20
|
+
assert_equal "& { &ACUTE; ሺ ࠏ", HtmlCleaner.add_entities("& { &ACUTE; ሺ ࠏ")
|
21
|
+
assert_equal "heavily subnet’d network,", HtmlCleaner.add_entities("heavily subnet’d network,")
|
22
|
+
end
|
23
|
+
|
24
|
+
# Exercises HtmlCleaner.clean with plain text, markup carrying unwanted
# attributes or elements, real-world feed snippets and historical XSS payloads,
# comparing each result with a literal expected string (one case only checks
# that the output does not contain "<>").
# NOTE(review): several literals below contain bare double quotes inside
# double-quoted strings, which suggests the page renderer decoded HTML entities
# that were present in the original file -- confirm the exact expected values
# against the gem's real test file.
def test_html_clean
|
25
|
+
assert_equal "", HtmlCleaner.clean("")
|
26
|
+
|
27
|
+
assert_equal "<p>foo > *</p>", HtmlCleaner.clean("<p>foo > *</p>")
|
28
|
+
assert_equal "<p>foo > *</p>", HtmlCleaner.clean("<p>foo > *</p>")
|
29
|
+
|
30
|
+
# The `foo` attribute is expected to be dropped; text outside the element is kept.
assert_equal "<p>para</p>", HtmlCleaner.clean("<p foo=bar>para</p>")
|
31
|
+
assert_equal "<p>para</p> outsider", HtmlCleaner.clean("<p foo=bar>para</p> outsider")
|
32
|
+
|
33
|
+
# Stray closing tags after the paragraph are expected to disappear.
assert_equal "<p>para</p>", HtmlCleaner.clean("<p>para</p></notvalid>")
|
34
|
+
assert_equal "<p>para</p>", HtmlCleaner.clean("<p>para</p></body>")
|
35
|
+
|
36
|
+
# plaintext, object/param and iframe elements are expected to be removed entirely.
assert_equal "<p>para</p>", HtmlCleaner.clean("<p>para</p><plaintext>")
|
37
|
+
assert_equal "<p>para</p>", HtmlCleaner.clean("<p>para</p><object><param></param></object>")
|
38
|
+
assert_equal "<p>para</p>", HtmlCleaner.clean("<p>para</p><iframe src='http://evil.example.org'></iframe>")
|
39
|
+
assert_equal "<p>para</p>", HtmlCleaner.clean("<p>para</p><iframe src='http://evil.example.org'>")
|
40
|
+
|
41
|
+
assert_equal "<p>para</p>", HtmlCleaner.clean("<p>para</p><invalid>invalid</invalid>")
|
42
|
+
|
43
|
+
# Single-quoted href values are expected to come back double-quoted.
assert_equal "<a href=\"http://example.org\">para</a>", HtmlCleaner.clean("<a href='http://example.org'>para</a>")
|
44
|
+
assert_equal "<a href=\"http://example.org/proc?a&b\">para</a>", HtmlCleaner.clean("<a href='http://example.org/proc?a&b'>para</a>")
|
45
|
+
|
46
|
+
# Inputs containing a body tag: only the content following it is expected in the result.
assert_equal "<p>two</p>", HtmlCleaner.clean("<p>para</p><body><p>two</p></body>")
|
47
|
+
assert_equal "<p>two</p>", HtmlCleaner.clean("<p>para</p><body><p>two</p>")
|
48
|
+
assert_equal "<p>para</p><bo /dy><p>two</p>", HtmlCleaner.clean("<p>para</p><bo /dy><p>two</p></body>")
|
49
|
+
assert_equal "<p>para</p><bo\\/dy><p>two</p>", HtmlCleaner.clean("<p>para</p><bo\\/dy><p>two</p></body>")
|
50
|
+
assert_equal "<p>two</p>", HtmlCleaner.clean("<p>para</p><body/><p>two</p></body>")
|
51
|
+
|
52
|
+
# Cleaning the output of a previous clean call is expected to give the same string.
assert_equal "<p>one & two</p>", HtmlCleaner.clean(HtmlCleaner.clean("<p>one & two</p>"))
|
53
|
+
|
54
|
+
# `id` is expected to survive while `ignore` and `onclick` are removed.
assert_equal "<p id=\"p\">para</p>", HtmlCleaner.clean("<p id=\"p\" ignore=\"this\">para</p>")
|
55
|
+
assert_equal "<p id=\"p\">para</p>", HtmlCleaner.clean("<p id=\"p\" onclick=\"this\">para</p>")
|
56
|
+
|
57
|
+
assert_equal "<img src=\"http://example.org/pic\" />", HtmlCleaner.clean("<img src=\"http://example.org/pic\" />")
|
58
|
+
# A src whose scheme is a spaced-out "jav a script:" is expected to be stripped from the tag.
assert_equal "<img />", HtmlCleaner.clean("<img src=\"jav a script:call()\" />")
|
59
|
+
|
60
|
+
assert_equal "what's new", HtmlCleaner.clean("what's new")
|
61
|
+
assert_equal ""what's new?"", HtmlCleaner.clean("\"what's new?\"")
|
62
|
+
assert_equal ""what's new?"", HtmlCleaner.clean(""what's new?"")
|
63
|
+
|
64
|
+
# Real-world examples from selected feeds
|
65
|
+
assert_equal "I have a heavily subnet’d/vlan’d network,", HtmlCleaner.clean("I have a heavily subnet’d/vlan’d network,")
|
66
|
+
|
67
|
+
assert_equal "<pre><blockquote><%= start_form_tag :action => "create" %></blockquote></pre>",
|
68
|
+
HtmlCleaner.clean("<pre><blockquote><%= start_form_tag :action => \"create\" %></blockquote></pre>")
|
69
|
+
|
70
|
+
assert_equal "<a href=\"http://www.mcall.com/news/local/all-smashedmachine1107-cn,0,1574203.story?coll=all-news-hed\">[link]</a><a href=\"http://reddit.com/info/pyhc/comments\">[more]</a>",
|
71
|
+
HtmlCleaner.clean("<a href=\"http://www.mcall.com/news/local/all-smashedmachine1107-cn,0,1574203.story?coll=all-news-hed\">[link]</a><a href=\"http://reddit.com/info/pyhc/comments\">[more]</a>")
|
72
|
+
|
73
|
+
|
74
|
+
# Various exploits from the past
|
75
|
+
assert_equal "", HtmlCleaner.clean("<_img foo=\"<IFRAME width='80%' height='400' src='http://alive.znep.com/~marcs/passport/grabit.html'></IFRAME>\" >")
|
76
|
+
assert_equal "<a href=\"https://bugzilla.mozilla.org/attachment.cgi?id=&action=force_internal_error<script>alert(document.cookie)</script>\">link</a>",
|
77
|
+
HtmlCleaner.clean("<a href=\"https://bugzilla.mozilla.org/attachment.cgi?id=&action=force_internal_error<script>alert(document.cookie)</script>\">link</a>")
|
78
|
+
# Event-handler attributes (onerror, onmouseover) are expected to be removed while src is kept.
assert_equal "<img src=\"doesntexist.jpg\" />", HtmlCleaner.clean("<img src='doesntexist.jpg' onerror='alert(document.cookie)'/>")
|
79
|
+
assert_equal "<img src=\"'doesntexist.jpg\" />", HtmlCleaner.clean("<img src=\"'doesntexist.jpg\" onmouseover=\"alert('img-ob-11');''\"/>")
|
80
|
+
assert_equal "<IMG """>">", HtmlCleaner.clean("<IMG \"\"\"><SCRIPT>alert(\"XSS\")</SCRIPT>\">")
|
81
|
+
|
82
|
+
# This doesn't come out as I would like, but the result is still safe.
|
83
|
+
# (Apparently, this would work in Gecko.)
|
84
|
+
assert HtmlCleaner.clean("<p onclick!\#$%&()*~+-_.,:;?@[/|\\]^=alert(\"XSS\")>para</p>") !~ /\<\>/
|
85
|
+
assert_equal "<SCRIPT/XSS SRC="http://ha.ckers.org/xss.js">", HtmlCleaner.clean("<SCRIPT/XSS SRC=\"http://ha.ckers.org/xss.js\"></SCRIPT>")
|
86
|
+
|
87
|
+
# HTML comments, including IE conditional comments wrapping a script, are expected to be removed.
assert_equal "", HtmlCleaner.clean("<!--[if gte IE 4]><SCRIPT>alert('XSS');</SCRIPT><![endif]-->")
|
88
|
+
assert_equal "<p></p>", HtmlCleaner.clean("<p><!--[if gte IE 4]><SCRIPT>alert('XSS');</SCRIPT><![endif]--></p>")
|
89
|
+
assert_equal "<p>hi</p><p></p>", HtmlCleaner.clean("<p>hi</p><p><!--[if gte IE 4]><SCRIPT>alert('XSS');</SCRIPT><![endif]--></p>")
|
90
|
+
|
91
|
+
assert_equal "<p>hello</p>", HtmlCleaner.clean("<p>h<!-- hoho -->ell<!-- hoho -->o</p>")
|
92
|
+
end
|
93
|
+
|
94
|
+
# Exercises HtmlCleaner.flatten with empty, plain, multi-line, quoted and
# URL-like inputs, comparing each result with a literal expected string.
# NOTE(review): some literals contain bare double quotes inside double-quoted
# strings, so entity references in the original file were presumably decoded by
# the page renderer -- confirm the exact expected values against the real file.
def test_html_flatten
|
95
|
+
assert_equal "", HtmlCleaner.flatten("")
|
96
|
+
|
97
|
+
assert_equal "hello", HtmlCleaner.flatten("hello")
|
98
|
+
# A newline in the input is expected to come back as a single space.
assert_equal "hello world", HtmlCleaner.flatten("hello\nworld")
|
99
|
+
|
100
|
+
assert_equal "A > B : C", HtmlCleaner.flatten("A > B : C")
|
101
|
+
assert_equal "what's new", HtmlCleaner.flatten("what's new")
|
102
|
+
assert_equal ""what's new?"", HtmlCleaner.flatten("\"what's new?\"")
|
103
|
+
|
104
|
+
# Input that ends in the middle of an opening tag.
assert_equal "we’ve got <a hre", HtmlCleaner.flatten("we’ve got <a hre")
|
105
|
+
|
106
|
+
assert_equal "http://example.org", HtmlCleaner.flatten("http://example.org")
|
107
|
+
assert_equal "http://example.org/proc?a&b", HtmlCleaner.flatten("http://example.org/proc?a&b")
|
108
|
+
|
109
|
+
# Flattening already-flattened output is checked against the same expected string.
assert_equal ""what's new?"", HtmlCleaner.flatten(HtmlCleaner.flatten("\"what's new?\""))
|
110
|
+
end
|
111
|
+
|
112
|
+
# Exercises HtmlCleaner.dodgy_uri?: the first groups assert a truthy result for
# script-style and data: URIs in various disguises (mixed case, embedded
# whitespace, URL-encoding, non-printing characters); the final group asserts
# nil for nil and for ordinary http URLs.
# NOTE(review): several neighbouring assertions look identical as displayed;
# they were presumably distinct entity-encoded variants that the page renderer
# decoded -- confirm against the gem's real test file.
def test_dodgy_uri
|
113
|
+
# All of these javascript urls work in IE6.
|
114
|
+
assert HtmlCleaner.dodgy_uri?("javascript:alert('HI');")
|
115
|
+
assert HtmlCleaner.dodgy_uri?(" javascript \n :alert('HI');")
|
116
|
+
assert HtmlCleaner.dodgy_uri?("JaVaScRiPt:alert('HI');")
|
117
|
+
assert HtmlCleaner.dodgy_uri?("JaV \naSc\nRiPt:alert('HI');")
|
118
|
+
|
119
|
+
# entities lacking ending ';'
|
120
|
+
# This only works if they're all packed together without spacing.
|
121
|
+
assert HtmlCleaner.dodgy_uri?("javascript:alert('img-ob-2')")
|
122
|
+
assert HtmlCleaner.dodgy_uri?("javascript:alert('img-ob-2' ) ; ")
|
123
|
+
# catch extra spacing anyway.. support for this is possible, depending where the spaces are.
|
124
|
+
assert HtmlCleaner.dodgy_uri?("j a v a s c r i p t : a l e r t ( ' i m g - o b - 2 ' ) ; ")
|
125
|
+
assert HtmlCleaner.dodgy_uri?("j a v a s c r i p t : a l e r t ( ' i m g - o b - 2 ' ) ; ")
|
126
|
+
assert HtmlCleaner.dodgy_uri?("javascript")
|
127
|
+
assert HtmlCleaner.dodgy_uri?("javascript")
|
128
|
+
|
129
|
+
# url-encoded
|
130
|
+
assert HtmlCleaner.dodgy_uri?("%6A%61%76%61%73%63%72%69%70%74%3A%61%6C%65%72%74%28%27%69%6D%67%2D%6F%62%2D%33%27%29")
|
131
|
+
|
132
|
+
# Other evil schemes
|
133
|
+
assert HtmlCleaner.dodgy_uri?("vbscript:MsgBox(\"hi\")")
|
134
|
+
assert HtmlCleaner.dodgy_uri?("mocha:alert('hi')")
|
135
|
+
assert HtmlCleaner.dodgy_uri?("livescript:alert('hi')")
|
136
|
+
assert HtmlCleaner.dodgy_uri?("data:text/html;base64,PHNjcmlwdD5hbGVydCgnWFNTJyk8L3NjcmlwdD4K")
|
137
|
+
|
138
|
+
# Various non-printing chars
|
139
|
+
assert HtmlCleaner.dodgy_uri?("javas\0cript:foo()")
|
140
|
+
assert HtmlCleaner.dodgy_uri?("  javascript:foo()")
|
141
|
+
assert HtmlCleaner.dodgy_uri?("jav
ascript:foo()")
|
142
|
+
assert HtmlCleaner.dodgy_uri?("jav	ascript:foo()")
|
143
|
+
assert HtmlCleaner.dodgy_uri?("jav\tascript:foo()")
|
144
|
+
|
145
|
+
# The Good
|
146
|
+
# assert_nil is used here, so these inputs must yield exactly nil, not merely a falsy value.
assert_nil HtmlCleaner.dodgy_uri?(nil)
|
147
|
+
assert_nil HtmlCleaner.dodgy_uri?("http://example.org")
|
148
|
+
assert_nil HtmlCleaner.dodgy_uri?("http://example.org/foo.html")
|
149
|
+
assert_nil HtmlCleaner.dodgy_uri?("http://example.org/foo.cgi?x=y&a=b")
|
150
|
+
assert_nil HtmlCleaner.dodgy_uri?("http://example.org/foo.cgi?x=y&a=b")
|
151
|
+
assert_nil HtmlCleaner.dodgy_uri?("http://example.org/foo.cgi?x=y&a=b")
|
152
|
+
assert_nil HtmlCleaner.dodgy_uri?("http://example.org/foo.cgi?x=yVa=b")
|
153
|
+
end
|
154
|
+
|
155
|
+
end
|
metadata
ADDED
@@ -0,0 +1,101 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: mikowitz-feed-normalizer
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 1.5.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Andrew A. Smith
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
|
12
|
+
date: 2008-10-10 00:00:00 -07:00
|
13
|
+
default_executable:
|
14
|
+
dependencies:
|
15
|
+
- !ruby/object:Gem::Dependency
|
16
|
+
name: simple-rss
|
17
|
+
version_requirement:
|
18
|
+
version_requirements: !ruby/object:Gem::Requirement
|
19
|
+
requirements:
|
20
|
+
- - ">="
|
21
|
+
- !ruby/object:Gem::Version
|
22
|
+
version: "1.1"
|
23
|
+
version:
|
24
|
+
- !ruby/object:Gem::Dependency
|
25
|
+
name: hpricot
|
26
|
+
version_requirement:
|
27
|
+
version_requirements: !ruby/object:Gem::Requirement
|
28
|
+
requirements:
|
29
|
+
- - ">="
|
30
|
+
- !ruby/object:Gem::Version
|
31
|
+
version: "0.6"
|
32
|
+
version:
|
33
|
+
- !ruby/object:Gem::Dependency
|
34
|
+
name: hoe
|
35
|
+
version_requirement:
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - ">="
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: 1.7.0
|
41
|
+
version:
|
42
|
+
description: An extensible Ruby wrapper for Atom and RSS parsers. Feed normalizer wraps various RSS and Atom parsers, and returns a single unified object graph, regardless of the underlying feed format.
|
43
|
+
email: andy@tinnedfruit.org
|
44
|
+
executables: []
|
45
|
+
|
46
|
+
extensions: []
|
47
|
+
|
48
|
+
extra_rdoc_files:
|
49
|
+
- History.txt
|
50
|
+
- License.txt
|
51
|
+
- Manifest.txt
|
52
|
+
- README.txt
|
53
|
+
files:
|
54
|
+
- History.txt
|
55
|
+
- License.txt
|
56
|
+
- Manifest.txt
|
57
|
+
- Rakefile
|
58
|
+
- README.txt
|
59
|
+
- lib/feed-normalizer.rb
|
60
|
+
- lib/html-cleaner.rb
|
61
|
+
- lib/parsers/rss.rb
|
62
|
+
- lib/parsers/simple-rss.rb
|
63
|
+
- lib/structures.rb
|
64
|
+
- test/data/atom03.xml
|
65
|
+
- test/data/atom10.xml
|
66
|
+
- test/data/rdf10.xml
|
67
|
+
- test/data/rss20.xml
|
68
|
+
- test/data/rss20diff.xml
|
69
|
+
- test/data/rss20diff_short.xml
|
70
|
+
- test/test_all.rb
|
71
|
+
- test/test_feednormalizer.rb
|
72
|
+
- test/test_htmlcleaner.rb
|
73
|
+
has_rdoc: true
|
74
|
+
homepage: http://feed-normalizer.rubyforge.org/
|
75
|
+
post_install_message:
|
76
|
+
rdoc_options:
|
77
|
+
- --main
|
78
|
+
- README.txt
|
79
|
+
require_paths:
|
80
|
+
- lib
|
81
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
82
|
+
requirements:
|
83
|
+
- - ">="
|
84
|
+
- !ruby/object:Gem::Version
|
85
|
+
version: "0"
|
86
|
+
version:
|
87
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
88
|
+
requirements:
|
89
|
+
- - ">="
|
90
|
+
- !ruby/object:Gem::Version
|
91
|
+
version: "0"
|
92
|
+
version:
|
93
|
+
requirements: []
|
94
|
+
|
95
|
+
rubyforge_project: feed-normalizer
|
96
|
+
rubygems_version: 1.2.0
|
97
|
+
signing_key:
|
98
|
+
specification_version: 2
|
99
|
+
summary: Extensible Ruby wrapper for Atom and RSS parsers
|
100
|
+
test_files:
|
101
|
+
- test/test_all.rb
|