mikowitz-feed-normalizer 1.5.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,155 @@
1
+ require 'test/unit'
2
+ require 'html-cleaner'
3
+
4
+ include FeedNormalizer
5
+
6
# Unit tests for HtmlCleaner: entity unescaping, entity escaping, HTML
# cleaning, flattening to text, and detection of dodgy URIs.
#
# Most tests walk an ordered table of [expected, input] pairs (or of bare
# inputs) and apply the same assertion to every row.
class HtmlCleanerTest < Test::Unit::TestCase

  # Rows are [expected, escaped input] for unescapeHTML.
  def test_unescape
    [
      ["' ' &deg;", "&apos; &#39; &deg;"],
      ["\" &deg;", "&quot; &deg;"],
      ["\"\"\"\"", "&#34;&#000000000000000000034;&#x22;&#x0000022;"],
      ["heavily subnet&#8217;d network,", "heavily subnet&#8217;d network,"],
    ].each do |expected, escaped|
      assert_equal(expected, FeedNormalizer::HtmlCleaner.unescapeHTML(escaped))
    end
  end

  # Rows are [expected, raw input] for add_entities; the first row covers nil.
  def test_add_entities
    [
      ["", nil],
      ["x &gt; y", "x > y"],
      ["1 &amp; 2", "1 & 2"],
      ["&amp; &#123; &acute; &#x123;", "& &#123; &acute; &#x123;"],
      ["&amp; &#123; &ACUTE; &#X123A; &#x80f;", "& &#123; &ACUTE; &#X123A; &#x80f;"],
      ["heavily subnet&#8217;d network,", "heavily subnet&#8217;d network,"],
    ].each do |expected, raw|
      assert_equal(expected, HtmlCleaner.add_entities(raw))
    end
  end

  # Exercises HtmlCleaner.clean against plain markup, disallowed tags and
  # attributes, real-world feed snippets, and historical exploit strings.
  def test_html_clean
    # Runs every [expected, html] row through clean, in table order.
    check = lambda do |rows|
      rows.each { |expected, html| assert_equal(expected, HtmlCleaner.clean(html)) }
    end

    check.call([
      ["", ""],

      ["<p>foo &gt; *</p>", "<p>foo > *</p>"],
      ["<p>foo &gt; *</p>", "<p>foo &gt; *</p>"],

      ["<p>para</p>", "<p foo=bar>para</p>"],
      ["<p>para</p> outsider", "<p foo=bar>para</p> outsider"],

      ["<p>para</p>", "<p>para</p></notvalid>"],
      ["<p>para</p>", "<p>para</p></body>"],

      ["<p>para</p>", "<p>para</p><plaintext>"],
      ["<p>para</p>", "<p>para</p><object><param></param></object>"],
      ["<p>para</p>", "<p>para</p><iframe src='http://evil.example.org'></iframe>"],
      ["<p>para</p>", "<p>para</p><iframe src='http://evil.example.org'>"],

      ["<p>para</p>", "<p>para</p><invalid>invalid</invalid>"],

      ["<a href=\"http://example.org\">para</a>", "<a href='http://example.org'>para</a>"],
      ["<a href=\"http://example.org/proc?a&amp;b\">para</a>", "<a href='http://example.org/proc?a&b'>para</a>"],

      ["<p>two</p>", "<p>para</p><body><p>two</p></body>"],
      ["<p>two</p>", "<p>para</p><body><p>two</p>"],
      ["<p>para</p>&lt;bo /dy&gt;<p>two</p>", "<p>para</p><bo /dy><p>two</p></body>"],
      ["<p>para</p>&lt;bo\\/dy&gt;<p>two</p>", "<p>para</p><bo\\/dy><p>two</p></body>"],
      ["<p>two</p>", "<p>para</p><body/><p>two</p></body>"],
    ])

    # Cleaning the cleaned output again is expected to leave a single &amp;.
    cleaned_twice = HtmlCleaner.clean(HtmlCleaner.clean("<p>one & two</p>"))
    assert_equal("<p>one &amp; two</p>", cleaned_twice)

    check.call([
      ["<p id=\"p\">para</p>", "<p id=\"p\" ignore=\"this\">para</p>"],
      ["<p id=\"p\">para</p>", "<p id=\"p\" onclick=\"this\">para</p>"],

      ["<img src=\"http://example.org/pic\" />", "<img src=\"http://example.org/pic\" />"],
      ["<img />", "<img src=\"jav a script:call()\" />"],

      ["what's new", "what&#000039;s new"],
      ["&quot;what's new?&quot;", "\"what&apos;s new?\""],
      ["&quot;what's new?&quot;", "&quot;what&apos;s new?&quot;"],

      # Real-world examples taken from selected feeds.
      ["I have a heavily subnet&#8217;d/vlan&#8217;d network,", "I have a heavily subnet&#8217;d/vlan&#8217;d network,"],

      ["<pre><blockquote>&lt;%= start_form_tag :action =&gt; &quot;create&quot; %&gt;</blockquote></pre>",
       "<pre><blockquote>&lt;%= start_form_tag :action => \"create\" %></blockquote></pre>"],

      ["<a href=\"http://www.mcall.com/news/local/all-smashedmachine1107-cn,0,1574203.story?coll=all-news-hed\">[link]</a><a href=\"http://reddit.com/info/pyhc/comments\">[more]</a>",
       "&lt;a href=\"http://www.mcall.com/news/local/all-smashedmachine1107-cn,0,1574203.story?coll=all-news-hed\"&gt;[link]&lt;/a&gt;&lt;a href=\"http://reddit.com/info/pyhc/comments\"&gt;[more]&lt;/a&gt;"],

      # Assorted exploits from the past.
      ["", "<_img foo=\"<IFRAME width='80%' height='400' src='http://alive.znep.com/~marcs/passport/grabit.html'></IFRAME>\" >"],
      ["<a href=\"https://bugzilla.mozilla.org/attachment.cgi?id=&amp;action=force_internal_error&lt;script&gt;alert(document.cookie)&lt;/script&gt;\">link</a>",
       "<a href=\"https://bugzilla.mozilla.org/attachment.cgi?id=&action=force_internal_error<script>alert(document.cookie)</script>\">link</a>"],
      ["<img src=\"doesntexist.jpg\" />", "<img src='doesntexist.jpg' onerror='alert(document.cookie)'/>"],
      ["<img src=\"'doesntexist.jpg\" />", "<img src=\"'doesntexist.jpg\" onmouseover=\"alert('img-ob-11');''\"/>"],
      ["&lt;IMG &quot;&quot;&quot;&gt;&quot;&gt;", "<IMG \"\"\"><SCRIPT>alert(\"XSS\")</SCRIPT>\">"],
    ])

    # The output here isn't what the author would like, but it is still safe:
    # only the absence of an empty "<>" is asserted.
    # (Apparently, this input would work in Gecko.)
    garbled_attribute = HtmlCleaner.clean("<p onclick!\#$%&()*~+-_.,:;?@[/|\\]^=alert(\"XSS\")>para</p>")
    assert(garbled_attribute !~ /\<\>/)

    check.call([
      ["&lt;SCRIPT/XSS SRC=&quot;http://ha.ckers.org/xss.js&quot;&gt;", "<SCRIPT/XSS SRC=\"http://ha.ckers.org/xss.js\"></SCRIPT>"],

      ["", "<!--[if gte IE 4]><SCRIPT>alert('XSS');</SCRIPT><![endif]-->"],
      ["<p></p>", "<p><!--[if gte IE 4]><SCRIPT>alert('XSS');</SCRIPT><![endif]--></p>"],
      ["<p>hi</p><p></p>", "<p>hi</p><p><!--[if gte IE 4]><SCRIPT>alert('XSS');</SCRIPT><![endif]--></p>"],

      ["<p>hello</p>", "<p>h<!-- hoho -->ell<!-- hoho -->o</p>"],
    ])
  end

  # Rows are [expected, input] for flatten, followed by a flatten-twice check.
  def test_html_flatten
    [
      ["", ""],

      ["hello", "hello"],
      ["hello world", "hello\nworld"],

      ["A &gt; B : C", "A > B : C"],
      ["what's new", "what&#39;s new"],
      ["&quot;what's new?&quot;", "\"what&apos;s new?\""],

      ["we&#8217;ve got &lt;a hre", "we&#8217;ve got <a hre"],

      ["http://example.org", "http://example.org"],
      ["http://example.org/proc?a&amp;b", "http://example.org/proc?a&b"],
    ].each do |expected, text|
      assert_equal(expected, HtmlCleaner.flatten(text))
    end

    flattened_twice = HtmlCleaner.flatten(HtmlCleaner.flatten("\"what&apos;s new?\""))
    assert_equal("&quot;what's new?&quot;", flattened_twice)
  end

  # dodgy_uri? must be truthy for every suspicious URI and nil for the good ones.
  def test_dodgy_uri
    suspicious = [
      # All of these javascript URLs work in IE6.
      "javascript:alert('HI');",
      " &#106;&#97;&#118;&#97;&#115;&#99;&#114;&#105;&#112;&#116; \n :alert('HI');",
      "JaVaScRiPt:alert('HI');",
      "JaV \naSc\nRiPt:alert('HI');",

      # Entities lacking the ending ';'.
      # This only works if they're all packed together without spacing.
      "&#106&#97&#118&#97&#115&#99&#114&#105&#112&#116&#58&#97&#108&#101&#114&#116&#40&#39&#105&#109&#103&#45&#111&#98&#45&#50&#39&#41",
      "&#106&#97&#118&#97&#115&#99&#114&#105&#112&#116&#58&#97&#108&#101&#114&#116&#40&#39&#105&#109&#103&#45&#111&#98&#45&#50&#39 &#41 ; ",
      # Catch extra spacing anyway; support for this is possible, depending on
      # where the spaces are.
      "&#106 &#97 &#118 &#97 &#115 &#99 &#114 &#105 &#112 &#116 &#58 &#97 &#108 &#101 &#114 &#116 &#40 &#39 &#105 &#109 &#103 &#45 &#111 &#98 &#45 &#50 &#39 &#41 ; ",
      "&#x06a &#97 &#118 &#97 &#115 &#99 &#114 &#105 &#112 &#116 &#58 &#97 &#108 &#101 &#114 &#116 &#40 &#39 &#105 &#109 &#103 &#45 &#111 &#98 &#45 &#50 &#39 &#41 ; ",
      "&#106avascript",
      "&#x06a;avascript",

      # URL-encoded.
      "%6A%61%76%61%73%63%72%69%70%74%3A%61%6C%65%72%74%28%27%69%6D%67%2D%6F%62%2D%33%27%29",

      # Other evil schemes.
      "vbscript:MsgBox(\"hi\")",
      "mocha:alert('hi')",
      "livescript:alert('hi')",
      "data:text/html;base64,PHNjcmlwdD5hbGVydCgnWFNTJyk8L3NjcmlwdD4K",

      # Various non-printing characters.
      "javas\0cript:foo()",
      " &#14; javascript:foo()",
      "jav&#x0A;ascript:foo()",
      "jav&#x09;ascript:foo()",
      "jav\tascript:foo()",
    ]
    suspicious.each { |uri| assert(HtmlCleaner.dodgy_uri?(uri)) }

    # The good.
    harmless = [
      nil,
      "http://example.org",
      "http://example.org/foo.html",
      "http://example.org/foo.cgi?x=y&a=b",
      "http://example.org/foo.cgi?x=y&amp;a=b",
      "http://example.org/foo.cgi?x=y&#38;a=b",
      # NOTE(review): &#x56; is the hex reference for "V"; the decimal case
      # above uses &#38; ("&"), so &#x26; may have been intended - confirm.
      "http://example.org/foo.cgi?x=y&#x56;a=b",
    ]
    harmless.each { |uri| assert_nil(HtmlCleaner.dodgy_uri?(uri)) }
  end

end
metadata ADDED
@@ -0,0 +1,101 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: mikowitz-feed-normalizer
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.5.1
5
+ platform: ruby
6
+ authors:
7
+ - Andrew A. Smith
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+
12
+ date: 2008-10-10 00:00:00 -07:00
13
+ default_executable:
14
+ dependencies:
15
+ - !ruby/object:Gem::Dependency
16
+ name: simple-rss
17
+ version_requirement:
18
+ version_requirements: !ruby/object:Gem::Requirement
19
+ requirements:
20
+ - - ">="
21
+ - !ruby/object:Gem::Version
22
+ version: "1.1"
23
+ version:
24
+ - !ruby/object:Gem::Dependency
25
+ name: hpricot
26
+ version_requirement:
27
+ version_requirements: !ruby/object:Gem::Requirement
28
+ requirements:
29
+ - - ">="
30
+ - !ruby/object:Gem::Version
31
+ version: "0.6"
32
+ version:
33
+ - !ruby/object:Gem::Dependency
34
+ name: hoe
35
+ version_requirement:
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: 1.7.0
41
+ version:
42
+ description: An extensible Ruby wrapper for Atom and RSS parsers. Feed normalizer wraps various RSS and Atom parsers, and returns a single unified object graph, regardless of the underlying feed format.
43
+ email: andy@tinnedfruit.org
44
+ executables: []
45
+
46
+ extensions: []
47
+
48
+ extra_rdoc_files:
49
+ - History.txt
50
+ - License.txt
51
+ - Manifest.txt
52
+ - README.txt
53
+ files:
54
+ - History.txt
55
+ - License.txt
56
+ - Manifest.txt
57
+ - Rakefile
58
+ - README.txt
59
+ - lib/feed-normalizer.rb
60
+ - lib/html-cleaner.rb
61
+ - lib/parsers/rss.rb
62
+ - lib/parsers/simple-rss.rb
63
+ - lib/structures.rb
64
+ - test/data/atom03.xml
65
+ - test/data/atom10.xml
66
+ - test/data/rdf10.xml
67
+ - test/data/rss20.xml
68
+ - test/data/rss20diff.xml
69
+ - test/data/rss20diff_short.xml
70
+ - test/test_all.rb
71
+ - test/test_feednormalizer.rb
72
+ - test/test_htmlcleaner.rb
73
+ has_rdoc: true
74
+ homepage: http://feed-normalizer.rubyforge.org/
75
+ post_install_message:
76
+ rdoc_options:
77
+ - --main
78
+ - README.txt
79
+ require_paths:
80
+ - lib
81
+ required_ruby_version: !ruby/object:Gem::Requirement
82
+ requirements:
83
+ - - ">="
84
+ - !ruby/object:Gem::Version
85
+ version: "0"
86
+ version:
87
+ required_rubygems_version: !ruby/object:Gem::Requirement
88
+ requirements:
89
+ - - ">="
90
+ - !ruby/object:Gem::Version
91
+ version: "0"
92
+ version:
93
+ requirements: []
94
+
95
+ rubyforge_project: feed-normalizer
96
+ rubygems_version: 1.2.0
97
+ signing_key:
98
+ specification_version: 2
99
+ summary: Extensible Ruby wrapper for Atom and RSS parsers
100
+ test_files:
101
+ - test/test_all.rb