feed-normalizer 1.5.1 → 1.5.2

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,155 +1,156 @@
1
- require 'test/unit'
2
- require 'html-cleaner'
3
-
4
- include FeedNormalizer
5
-
6
- class HtmlCleanerTest < Test::Unit::TestCase
7
-
8
- def test_unescape
9
- assert_equal "' ' &deg;", FeedNormalizer::HtmlCleaner.unescapeHTML("&apos; &#39; &deg;")
10
- assert_equal "\" &deg;", FeedNormalizer::HtmlCleaner.unescapeHTML("&quot; &deg;")
11
- assert_equal "\"\"\"\"", FeedNormalizer::HtmlCleaner.unescapeHTML("&#34;&#000000000000000000034;&#x22;&#x0000022;")
12
- assert_equal "heavily subnet&#8217;d network,", FeedNormalizer::HtmlCleaner.unescapeHTML("heavily subnet&#8217;d network,")
13
- end
14
-
15
- def test_add_entities
16
- assert_equal "", HtmlCleaner.add_entities(nil)
17
- assert_equal "x &gt; y", HtmlCleaner.add_entities("x > y")
18
- assert_equal "1 &amp; 2", HtmlCleaner.add_entities("1 & 2")
19
- assert_equal "&amp; &#123; &acute; &#x123;", HtmlCleaner.add_entities("& &#123; &acute; &#x123;")
20
- assert_equal "&amp; &#123; &ACUTE; &#X123A; &#x80f;", HtmlCleaner.add_entities("& &#123; &ACUTE; &#X123A; &#x80f;")
21
- assert_equal "heavily subnet&#8217;d network,", HtmlCleaner.add_entities("heavily subnet&#8217;d network,")
22
- end
23
-
24
- def test_html_clean
25
- assert_equal "", HtmlCleaner.clean("")
26
-
27
- assert_equal "<p>foo &gt; *</p>", HtmlCleaner.clean("<p>foo > *</p>")
28
- assert_equal "<p>foo &gt; *</p>", HtmlCleaner.clean("<p>foo &gt; *</p>")
29
-
30
- assert_equal "<p>para</p>", HtmlCleaner.clean("<p foo=bar>para</p>")
31
- assert_equal "<p>para</p> outsider", HtmlCleaner.clean("<p foo=bar>para</p> outsider")
32
-
33
- assert_equal "<p>para</p>", HtmlCleaner.clean("<p>para</p></notvalid>")
34
- assert_equal "<p>para</p>", HtmlCleaner.clean("<p>para</p></body>")
35
-
36
- assert_equal "<p>para</p>", HtmlCleaner.clean("<p>para</p><plaintext>")
37
- assert_equal "<p>para</p>", HtmlCleaner.clean("<p>para</p><object><param></param></object>")
38
- assert_equal "<p>para</p>", HtmlCleaner.clean("<p>para</p><iframe src='http://evil.example.org'></iframe>")
39
- assert_equal "<p>para</p>", HtmlCleaner.clean("<p>para</p><iframe src='http://evil.example.org'>")
40
-
41
- assert_equal "<p>para</p>", HtmlCleaner.clean("<p>para</p><invalid>invalid</invalid>")
42
-
43
- assert_equal "<a href=\"http://example.org\">para</a>", HtmlCleaner.clean("<a href='http://example.org'>para</a>")
44
- assert_equal "<a href=\"http://example.org/proc?a&amp;b\">para</a>", HtmlCleaner.clean("<a href='http://example.org/proc?a&b'>para</a>")
45
-
46
- assert_equal "<p>two</p>", HtmlCleaner.clean("<p>para</p><body><p>two</p></body>")
47
- assert_equal "<p>two</p>", HtmlCleaner.clean("<p>para</p><body><p>two</p>")
48
- assert_equal "<p>para</p>&lt;bo /dy&gt;<p>two</p>", HtmlCleaner.clean("<p>para</p><bo /dy><p>two</p></body>")
49
- assert_equal "<p>para</p>&lt;bo\\/dy&gt;<p>two</p>", HtmlCleaner.clean("<p>para</p><bo\\/dy><p>two</p></body>")
50
- assert_equal "<p>two</p>", HtmlCleaner.clean("<p>para</p><body/><p>two</p></body>")
51
-
52
- assert_equal "<p>one &amp; two</p>", HtmlCleaner.clean(HtmlCleaner.clean("<p>one & two</p>"))
53
-
54
- assert_equal "<p id=\"p\">para</p>", HtmlCleaner.clean("<p id=\"p\" ignore=\"this\">para</p>")
55
- assert_equal "<p id=\"p\">para</p>", HtmlCleaner.clean("<p id=\"p\" onclick=\"this\">para</p>")
56
-
57
- assert_equal "<img src=\"http://example.org/pic\" />", HtmlCleaner.clean("<img src=\"http://example.org/pic\" />")
58
- assert_equal "<img />", HtmlCleaner.clean("<img src=\"jav a script:call()\" />")
59
-
60
- assert_equal "what's new", HtmlCleaner.clean("what&#000039;s new")
61
- assert_equal "&quot;what's new?&quot;", HtmlCleaner.clean("\"what&apos;s new?\"")
62
- assert_equal "&quot;what's new?&quot;", HtmlCleaner.clean("&quot;what&apos;s new?&quot;")
63
-
64
- # Real-world examples from selected feeds
65
- assert_equal "I have a heavily subnet&#8217;d/vlan&#8217;d network,", HtmlCleaner.clean("I have a heavily subnet&#8217;d/vlan&#8217;d network,")
66
-
67
- assert_equal "<pre><blockquote>&lt;%= start_form_tag :action =&gt; &quot;create&quot; %&gt;</blockquote></pre>",
68
- HtmlCleaner.clean("<pre><blockquote>&lt;%= start_form_tag :action => \"create\" %></blockquote></pre>")
69
-
70
- assert_equal "<a href=\"http://www.mcall.com/news/local/all-smashedmachine1107-cn,0,1574203.story?coll=all-news-hed\">[link]</a><a href=\"http://reddit.com/info/pyhc/comments\">[more]</a>",
71
- HtmlCleaner.clean("&lt;a href=\"http://www.mcall.com/news/local/all-smashedmachine1107-cn,0,1574203.story?coll=all-news-hed\"&gt;[link]&lt;/a&gt;&lt;a href=\"http://reddit.com/info/pyhc/comments\"&gt;[more]&lt;/a&gt;")
72
-
73
-
74
- # Various exploits from the past
75
- assert_equal "", HtmlCleaner.clean("<_img foo=\"<IFRAME width='80%' height='400' src='http://alive.znep.com/~marcs/passport/grabit.html'></IFRAME>\" >")
76
- assert_equal "<a href=\"https://bugzilla.mozilla.org/attachment.cgi?id=&amp;action=force_internal_error&lt;script&gt;alert(document.cookie)&lt;/script&gt;\">link</a>",
77
- HtmlCleaner.clean("<a href=\"https://bugzilla.mozilla.org/attachment.cgi?id=&action=force_internal_error<script>alert(document.cookie)</script>\">link</a>")
78
- assert_equal "<img src=\"doesntexist.jpg\" />", HtmlCleaner.clean("<img src='doesntexist.jpg' onerror='alert(document.cookie)'/>")
79
- assert_equal "<img src=\"'doesntexist.jpg\" />", HtmlCleaner.clean("<img src=\"'doesntexist.jpg\" onmouseover=\"alert('img-ob-11');''\"/>")
80
- assert_equal "&lt;IMG &quot;&quot;&quot;&gt;&quot;&gt;", HtmlCleaner.clean("<IMG \"\"\"><SCRIPT>alert(\"XSS\")</SCRIPT>\">")
81
-
82
- # This doesn't come out as I would like, but the result is still safe.
83
- # (Apparently, this would work in Gecko.)
84
- assert HtmlCleaner.clean("<p onclick!\#$%&()*~+-_.,:;?@[/|\\]^=alert(\"XSS\")>para</p>") !~ /\<\>/
85
- assert_equal "&lt;SCRIPT/XSS SRC=&quot;http://ha.ckers.org/xss.js&quot;&gt;", HtmlCleaner.clean("<SCRIPT/XSS SRC=\"http://ha.ckers.org/xss.js\"></SCRIPT>")
86
-
87
- assert_equal "", HtmlCleaner.clean("<!--[if gte IE 4]><SCRIPT>alert('XSS');</SCRIPT><![endif]-->")
88
- assert_equal "<p></p>", HtmlCleaner.clean("<p><!--[if gte IE 4]><SCRIPT>alert('XSS');</SCRIPT><![endif]--></p>")
89
- assert_equal "<p>hi</p><p></p>", HtmlCleaner.clean("<p>hi</p><p><!--[if gte IE 4]><SCRIPT>alert('XSS');</SCRIPT><![endif]--></p>")
90
-
91
- assert_equal "<p>hello</p>", HtmlCleaner.clean("<p>h<!-- hoho -->ell<!-- hoho -->o</p>")
92
- end
93
-
94
- def test_html_flatten
95
- assert_equal "", HtmlCleaner.flatten("")
96
-
97
- assert_equal "hello", HtmlCleaner.flatten("hello")
98
- assert_equal "hello world", HtmlCleaner.flatten("hello\nworld")
99
-
100
- assert_equal "A &gt; B : C", HtmlCleaner.flatten("A > B : C")
101
- assert_equal "what's new", HtmlCleaner.flatten("what&#39;s new")
102
- assert_equal "&quot;what's new?&quot;", HtmlCleaner.flatten("\"what&apos;s new?\"")
103
-
104
- assert_equal "we&#8217;ve got &lt;a hre", HtmlCleaner.flatten("we&#8217;ve got <a hre")
105
-
106
- assert_equal "http://example.org", HtmlCleaner.flatten("http://example.org")
107
- assert_equal "http://example.org/proc?a&amp;b", HtmlCleaner.flatten("http://example.org/proc?a&b")
108
-
109
- assert_equal "&quot;what's new?&quot;", HtmlCleaner.flatten(HtmlCleaner.flatten("\"what&apos;s new?\""))
110
- end
111
-
112
- def test_dodgy_uri
113
- # All of these javascript urls work in IE6.
114
- assert HtmlCleaner.dodgy_uri?("javascript:alert('HI');")
115
- assert HtmlCleaner.dodgy_uri?(" &#106;&#97;&#118;&#97;&#115;&#99;&#114;&#105;&#112;&#116; \n :alert('HI');")
116
- assert HtmlCleaner.dodgy_uri?("JaVaScRiPt:alert('HI');")
117
- assert HtmlCleaner.dodgy_uri?("JaV \naSc\nRiPt:alert('HI');")
118
-
119
- # entities lacking ending ';'
120
- # This only works if they're all packed together without spacing.
121
- assert HtmlCleaner.dodgy_uri?("&#106&#97&#118&#97&#115&#99&#114&#105&#112&#116&#58&#97&#108&#101&#114&#116&#40&#39&#105&#109&#103&#45&#111&#98&#45&#50&#39&#41")
122
- assert HtmlCleaner.dodgy_uri?("&#106&#97&#118&#97&#115&#99&#114&#105&#112&#116&#58&#97&#108&#101&#114&#116&#40&#39&#105&#109&#103&#45&#111&#98&#45&#50&#39 &#41 ; ")
123
- # catch extra spacing anyway.. support for this is possible, depending where the spaces are.
124
- assert HtmlCleaner.dodgy_uri?("&#106 &#97 &#118 &#97 &#115 &#99 &#114 &#105 &#112 &#116 &#58 &#97 &#108 &#101 &#114 &#116 &#40 &#39 &#105 &#109 &#103 &#45 &#111 &#98 &#45 &#50 &#39 &#41 ; ")
125
- assert HtmlCleaner.dodgy_uri?("&#x06a &#97 &#118 &#97 &#115 &#99 &#114 &#105 &#112 &#116 &#58 &#97 &#108 &#101 &#114 &#116 &#40 &#39 &#105 &#109 &#103 &#45 &#111 &#98 &#45 &#50 &#39 &#41 ; ")
126
- assert HtmlCleaner.dodgy_uri?("&#106avascript")
127
- assert HtmlCleaner.dodgy_uri?("&#x06a;avascript")
128
-
129
- # url-encoded
130
- assert HtmlCleaner.dodgy_uri?("%6A%61%76%61%73%63%72%69%70%74%3A%61%6C%65%72%74%28%27%69%6D%67%2D%6F%62%2D%33%27%29")
131
-
132
- # Other evil schemes
133
- assert HtmlCleaner.dodgy_uri?("vbscript:MsgBox(\"hi\")")
134
- assert HtmlCleaner.dodgy_uri?("mocha:alert('hi')")
135
- assert HtmlCleaner.dodgy_uri?("livescript:alert('hi')")
136
- assert HtmlCleaner.dodgy_uri?("data:text/html;base64,PHNjcmlwdD5hbGVydCgnWFNTJyk8L3NjcmlwdD4K")
137
-
138
- # Various non-printing chars
139
- assert HtmlCleaner.dodgy_uri?("javas\0cript:foo()")
140
- assert HtmlCleaner.dodgy_uri?(" &#14; javascript:foo()")
141
- assert HtmlCleaner.dodgy_uri?("jav&#x0A;ascript:foo()")
142
- assert HtmlCleaner.dodgy_uri?("jav&#x09;ascript:foo()")
143
- assert HtmlCleaner.dodgy_uri?("jav\tascript:foo()")
144
-
145
- # The Good
146
- assert_nil HtmlCleaner.dodgy_uri?(nil)
147
- assert_nil HtmlCleaner.dodgy_uri?("http://example.org")
148
- assert_nil HtmlCleaner.dodgy_uri?("http://example.org/foo.html")
149
- assert_nil HtmlCleaner.dodgy_uri?("http://example.org/foo.cgi?x=y&a=b")
150
- assert_nil HtmlCleaner.dodgy_uri?("http://example.org/foo.cgi?x=y&amp;a=b")
151
- assert_nil HtmlCleaner.dodgy_uri?("http://example.org/foo.cgi?x=y&#38;a=b")
152
- assert_nil HtmlCleaner.dodgy_uri?("http://example.org/foo.cgi?x=y&#x56;a=b")
153
- end
154
-
155
- end
1
+ $LOAD_PATH.unshift(File.expand_path(File.join(File.dirname(__FILE__), '../lib')))
2
+ require 'test/unit'
3
+ require 'html-cleaner'
4
+
5
+ include FeedNormalizer
6
+
7
+ class HtmlCleanerTest < Test::Unit::TestCase
8
+
9
+ def test_unescape
10
+ assert_equal "' ' &deg;", FeedNormalizer::HtmlCleaner.unescapeHTML("&apos; &#39; &deg;")
11
+ assert_equal "\" &deg;", FeedNormalizer::HtmlCleaner.unescapeHTML("&quot; &deg;")
12
+ assert_equal "\"\"\"\"", FeedNormalizer::HtmlCleaner.unescapeHTML("&#34;&#000000000000000000034;&#x22;&#x0000022;")
13
+ assert_equal "heavily subnet&#8217;d network,", FeedNormalizer::HtmlCleaner.unescapeHTML("heavily subnet&#8217;d network,")
14
+ end
15
+
16
+ def test_add_entities
17
+ assert_equal "", HtmlCleaner.add_entities(nil)
18
+ assert_equal "x &gt; y", HtmlCleaner.add_entities("x > y")
19
+ assert_equal "1 &amp; 2", HtmlCleaner.add_entities("1 & 2")
20
+ assert_equal "&amp; &#123; &acute; &#x123;", HtmlCleaner.add_entities("& &#123; &acute; &#x123;")
21
+ assert_equal "&amp; &#123; &ACUTE; &#X123A; &#x80f;", HtmlCleaner.add_entities("& &#123; &ACUTE; &#X123A; &#x80f;")
22
+ assert_equal "heavily subnet&#8217;d network,", HtmlCleaner.add_entities("heavily subnet&#8217;d network,")
23
+ end
24
+
25
+ def test_html_clean
26
+ assert_equal "", HtmlCleaner.clean("")
27
+
28
+ assert_equal "<p>foo &gt; *</p>", HtmlCleaner.clean("<p>foo > *</p>")
29
+ assert_equal "<p>foo &gt; *</p>", HtmlCleaner.clean("<p>foo &gt; *</p>")
30
+
31
+ assert_equal "<p>para</p>", HtmlCleaner.clean("<p foo=bar>para</p>")
32
+ assert_equal "<p>para</p> outsider", HtmlCleaner.clean("<p foo=bar>para</p> outsider")
33
+
34
+ assert_equal "<p>para</p>", HtmlCleaner.clean("<p>para</p></notvalid>")
35
+ assert_equal "<p>para</p>", HtmlCleaner.clean("<p>para</p></body>")
36
+
37
+ assert_equal "<p>para</p>", HtmlCleaner.clean("<p>para</p><plaintext>")
38
+ assert_equal "<p>para</p>", HtmlCleaner.clean("<p>para</p><object><param></param></object>")
39
+ assert_equal "<p>para</p>", HtmlCleaner.clean("<p>para</p><iframe src='http://evil.example.org'></iframe>")
40
+ assert_equal "<p>para</p>", HtmlCleaner.clean("<p>para</p><iframe src='http://evil.example.org'>")
41
+
42
+ assert_equal "<p>para</p>", HtmlCleaner.clean("<p>para</p><invalid>invalid</invalid>")
43
+
44
+ assert_equal "<a href=\"http://example.org\">para</a>", HtmlCleaner.clean("<a href='http://example.org'>para</a>")
45
+ assert_equal "<a href=\"http://example.org/proc?a&amp;b\">para</a>", HtmlCleaner.clean("<a href='http://example.org/proc?a&b'>para</a>")
46
+
47
+ assert_equal "<p>two</p>", HtmlCleaner.clean("<p>para</p><body><p>two</p></body>")
48
+ assert_equal "<p>two</p>", HtmlCleaner.clean("<p>para</p><body><p>two</p>")
49
+ assert_equal "<p>para</p>&lt;bo /dy&gt;<p>two</p>", HtmlCleaner.clean("<p>para</p><bo /dy><p>two</p></body>")
50
+ assert_equal "<p>para</p>&lt;bo\\/dy&gt;<p>two</p>", HtmlCleaner.clean("<p>para</p><bo\\/dy><p>two</p></body>")
51
+ assert_equal "<p>two</p>", HtmlCleaner.clean("<p>para</p><body/><p>two</p></body>")
52
+
53
+ assert_equal "<p>one &amp; two</p>", HtmlCleaner.clean(HtmlCleaner.clean("<p>one & two</p>"))
54
+
55
+ assert_equal "<p id=\"p\">para</p>", HtmlCleaner.clean("<p id=\"p\" ignore=\"this\">para</p>")
56
+ assert_equal "<p id=\"p\">para</p>", HtmlCleaner.clean("<p id=\"p\" onclick=\"this\">para</p>")
57
+
58
+ assert_equal "<img src=\"http://example.org/pic\" />", HtmlCleaner.clean("<img src=\"http://example.org/pic\" />")
59
+ assert_equal "<img />", HtmlCleaner.clean("<img src=\"jav a script:call()\" />")
60
+
61
+ assert_equal "what's new", HtmlCleaner.clean("what&#000039;s new")
62
+ assert_equal "&quot;what's new?&quot;", HtmlCleaner.clean("\"what&apos;s new?\"")
63
+ assert_equal "&quot;what's new?&quot;", HtmlCleaner.clean("&quot;what&apos;s new?&quot;")
64
+
65
+ # Real-world examples from selected feeds
66
+ assert_equal "I have a heavily subnet&#8217;d/vlan&#8217;d network,", HtmlCleaner.clean("I have a heavily subnet&#8217;d/vlan&#8217;d network,")
67
+
68
+ assert_equal "<pre><blockquote>&lt;%= start_form_tag :action =&gt; &quot;create&quot; %&gt;</blockquote></pre>",
69
+ HtmlCleaner.clean("<pre><blockquote>&lt;%= start_form_tag :action => \"create\" %></blockquote></pre>")
70
+
71
+ assert_equal "<a href=\"http://www.mcall.com/news/local/all-smashedmachine1107-cn,0,1574203.story?coll=all-news-hed\">[link]</a><a href=\"http://reddit.com/info/pyhc/comments\">[more]</a>",
72
+ HtmlCleaner.clean("&lt;a href=\"http://www.mcall.com/news/local/all-smashedmachine1107-cn,0,1574203.story?coll=all-news-hed\"&gt;[link]&lt;/a&gt;&lt;a href=\"http://reddit.com/info/pyhc/comments\"&gt;[more]&lt;/a&gt;")
73
+
74
+
75
+ # Various exploits from the past
76
+ assert_equal "", HtmlCleaner.clean("<_img foo=\"<IFRAME width='80%' height='400' src='http://alive.znep.com/~marcs/passport/grabit.html'></IFRAME>\" >")
77
+ assert_equal "<a href=\"https://bugzilla.mozilla.org/attachment.cgi?id=&amp;action=force_internal_error&lt;script&gt;alert(document.cookie)&lt;/script&gt;\">link</a>",
78
+ HtmlCleaner.clean("<a href=\"https://bugzilla.mozilla.org/attachment.cgi?id=&action=force_internal_error<script>alert(document.cookie)</script>\">link</a>")
79
+ assert_equal "<img src=\"doesntexist.jpg\" />", HtmlCleaner.clean("<img src='doesntexist.jpg' onerror='alert(document.cookie)'/>")
80
+ assert_equal "<img src=\"'doesntexist.jpg\" />", HtmlCleaner.clean("<img src=\"'doesntexist.jpg\" onmouseover=\"alert('img-ob-11');''\"/>")
81
+ assert_equal "&lt;IMG &quot;&quot;&quot;&gt;&quot;&gt;", HtmlCleaner.clean("<IMG \"\"\"><SCRIPT>alert(\"XSS\")</SCRIPT>\">")
82
+
83
+ # This doesn't come out as I would like, but the result is still safe.
84
+ # (Apparently, this would work in Gecko.)
85
+ assert HtmlCleaner.clean("<p onclick!\#$%&()*~+-_.,:;?@[/|\\]^=alert(\"XSS\")>para</p>") !~ /\<\>/
86
+ assert_equal "&lt;SCRIPT/XSS SRC=&quot;http://ha.ckers.org/xss.js&quot;&gt;", HtmlCleaner.clean("<SCRIPT/XSS SRC=\"http://ha.ckers.org/xss.js\"></SCRIPT>")
87
+
88
+ assert_equal "", HtmlCleaner.clean("<!--[if gte IE 4]><SCRIPT>alert('XSS');</SCRIPT><![endif]-->")
89
+ assert_equal "<p></p>", HtmlCleaner.clean("<p><!--[if gte IE 4]><SCRIPT>alert('XSS');</SCRIPT><![endif]--></p>")
90
+ assert_equal "<p>hi</p><p></p>", HtmlCleaner.clean("<p>hi</p><p><!--[if gte IE 4]><SCRIPT>alert('XSS');</SCRIPT><![endif]--></p>")
91
+
92
+ assert_equal "<p>hello</p>", HtmlCleaner.clean("<p>h<!-- hoho -->ell<!-- hoho -->o</p>")
93
+ end
94
+
95
+ def test_html_flatten
96
+ assert_equal "", HtmlCleaner.flatten("")
97
+
98
+ assert_equal "hello", HtmlCleaner.flatten("hello")
99
+ assert_equal "hello world", HtmlCleaner.flatten("hello\nworld")
100
+
101
+ assert_equal "A &gt; B : C", HtmlCleaner.flatten("A > B : C")
102
+ assert_equal "what's new", HtmlCleaner.flatten("what&#39;s new")
103
+ assert_equal "&quot;what's new?&quot;", HtmlCleaner.flatten("\"what&apos;s new?\"")
104
+
105
+ assert_equal "we&#8217;ve got &lt;a hre", HtmlCleaner.flatten("we&#8217;ve got <a hre")
106
+
107
+ assert_equal "http://example.org", HtmlCleaner.flatten("http://example.org")
108
+ assert_equal "http://example.org/proc?a&amp;b", HtmlCleaner.flatten("http://example.org/proc?a&b")
109
+
110
+ assert_equal "&quot;what's new?&quot;", HtmlCleaner.flatten(HtmlCleaner.flatten("\"what&apos;s new?\""))
111
+ end
112
+
113
+ def test_dodgy_uri
114
+ # All of these javascript urls work in IE6.
115
+ assert HtmlCleaner.dodgy_uri?("javascript:alert('HI');")
116
+ assert HtmlCleaner.dodgy_uri?(" &#106;&#97;&#118;&#97;&#115;&#99;&#114;&#105;&#112;&#116; \n :alert('HI');")
117
+ assert HtmlCleaner.dodgy_uri?("JaVaScRiPt:alert('HI');")
118
+ assert HtmlCleaner.dodgy_uri?("JaV \naSc\nRiPt:alert('HI');")
119
+
120
+ # entities lacking ending ';'
121
+ # This only works if they're all packed together without spacing.
122
+ assert HtmlCleaner.dodgy_uri?("&#106&#97&#118&#97&#115&#99&#114&#105&#112&#116&#58&#97&#108&#101&#114&#116&#40&#39&#105&#109&#103&#45&#111&#98&#45&#50&#39&#41")
123
+ assert HtmlCleaner.dodgy_uri?("&#106&#97&#118&#97&#115&#99&#114&#105&#112&#116&#58&#97&#108&#101&#114&#116&#40&#39&#105&#109&#103&#45&#111&#98&#45&#50&#39 &#41 ; ")
124
+ # catch extra spacing anyway.. support for this is possible, depending where the spaces are.
125
+ assert HtmlCleaner.dodgy_uri?("&#106 &#97 &#118 &#97 &#115 &#99 &#114 &#105 &#112 &#116 &#58 &#97 &#108 &#101 &#114 &#116 &#40 &#39 &#105 &#109 &#103 &#45 &#111 &#98 &#45 &#50 &#39 &#41 ; ")
126
+ assert HtmlCleaner.dodgy_uri?("&#x06a &#97 &#118 &#97 &#115 &#99 &#114 &#105 &#112 &#116 &#58 &#97 &#108 &#101 &#114 &#116 &#40 &#39 &#105 &#109 &#103 &#45 &#111 &#98 &#45 &#50 &#39 &#41 ; ")
127
+ assert HtmlCleaner.dodgy_uri?("&#106avascript")
128
+ assert HtmlCleaner.dodgy_uri?("&#x06a;avascript")
129
+
130
+ # url-encoded
131
+ assert HtmlCleaner.dodgy_uri?("%6A%61%76%61%73%63%72%69%70%74%3A%61%6C%65%72%74%28%27%69%6D%67%2D%6F%62%2D%33%27%29")
132
+
133
+ # Other evil schemes
134
+ assert HtmlCleaner.dodgy_uri?("vbscript:MsgBox(\"hi\")")
135
+ assert HtmlCleaner.dodgy_uri?("mocha:alert('hi')")
136
+ assert HtmlCleaner.dodgy_uri?("livescript:alert('hi')")
137
+ assert HtmlCleaner.dodgy_uri?("data:text/html;base64,PHNjcmlwdD5hbGVydCgnWFNTJyk8L3NjcmlwdD4K")
138
+
139
+ # Various non-printing chars
140
+ assert HtmlCleaner.dodgy_uri?("javas\0cript:foo()")
141
+ assert HtmlCleaner.dodgy_uri?(" &#14; javascript:foo()")
142
+ assert HtmlCleaner.dodgy_uri?("jav&#x0A;ascript:foo()")
143
+ assert HtmlCleaner.dodgy_uri?("jav&#x09;ascript:foo()")
144
+ assert HtmlCleaner.dodgy_uri?("jav\tascript:foo()")
145
+
146
+ # The Good
147
+ assert_nil HtmlCleaner.dodgy_uri?(nil)
148
+ assert_nil HtmlCleaner.dodgy_uri?("http://example.org")
149
+ assert_nil HtmlCleaner.dodgy_uri?("http://example.org/foo.html")
150
+ assert_nil HtmlCleaner.dodgy_uri?("http://example.org/foo.cgi?x=y&a=b")
151
+ assert_nil HtmlCleaner.dodgy_uri?("http://example.org/foo.cgi?x=y&amp;a=b")
152
+ assert_nil HtmlCleaner.dodgy_uri?("http://example.org/foo.cgi?x=y&#38;a=b")
153
+ assert_nil HtmlCleaner.dodgy_uri?("http://example.org/foo.cgi?x=y&#x56;a=b")
154
+ end
155
+
156
+ end
metadata CHANGED
@@ -1,74 +1,22 @@
1
1
  --- !ruby/object:Gem::Specification
2
- rubygems_version: 0.9.4
3
- specification_version: 1
4
2
  name: feed-normalizer
5
3
  version: !ruby/object:Gem::Version
6
- version: 1.5.1
7
- date: 2008-02-06 00:00:00 -08:00
8
- summary: Extensible Ruby wrapper for Atom and RSS parsers
9
- require_paths:
10
- - lib
11
- email: andy@tinnedfruit.org
12
- homepage: http://feed-normalizer.rubyforge.org/
13
- rubyforge_project: feed-normalizer
14
- description: An extensible Ruby wrapper for Atom and RSS parsers. Feed normalizer wraps various RSS and Atom parsers, and returns a single unified object graph, regardless of the underlying feed format.
15
- autorequire:
16
- default_executable:
17
- bindir: bin
18
- has_rdoc: true
19
- required_ruby_version: !ruby/object:Gem::Version::Requirement
20
- requirements:
21
- - - ">"
22
- - !ruby/object:Gem::Version
23
- version: 0.0.0
24
- version:
4
+ version: 1.5.2
25
5
  platform: ruby
26
- signing_key:
27
- cert_chain:
28
- post_install_message:
29
6
  authors:
30
7
  - Andrew A. Smith
31
- files:
32
- - History.txt
33
- - License.txt
34
- - Manifest.txt
35
- - Rakefile
36
- - README.txt
37
- - lib/feed-normalizer.rb
38
- - lib/html-cleaner.rb
39
- - lib/parsers/rss.rb
40
- - lib/parsers/simple-rss.rb
41
- - lib/structures.rb
42
- - test/data/atom03.xml
43
- - test/data/atom10.xml
44
- - test/data/rdf10.xml
45
- - test/data/rss20.xml
46
- - test/data/rss20diff.xml
47
- - test/data/rss20diff_short.xml
48
- - test/test_all.rb
49
- - test/test_feednormalizer.rb
50
- - test/test_htmlcleaner.rb
51
- test_files:
52
- - test/test_all.rb
53
- rdoc_options:
54
- - --main
55
- - README.txt
56
- extra_rdoc_files:
57
- - History.txt
58
- - License.txt
59
- - Manifest.txt
60
- - README.txt
61
- executables: []
62
-
63
- extensions: []
64
-
65
- requirements: []
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
66
11
 
12
+ date: 2010-01-25 00:00:00 -08:00
13
+ default_executable:
67
14
  dependencies:
68
15
  - !ruby/object:Gem::Dependency
69
16
  name: simple-rss
17
+ type: :runtime
70
18
  version_requirement:
71
- version_requirements: !ruby/object:Gem::Version::Requirement
19
+ version_requirements: !ruby/object:Gem::Requirement
72
20
  requirements:
73
21
  - - ">="
74
22
  - !ruby/object:Gem::Version
@@ -76,19 +24,107 @@ dependencies:
76
24
  version:
77
25
  - !ruby/object:Gem::Dependency
78
26
  name: hpricot
27
+ type: :runtime
79
28
  version_requirement:
80
- version_requirements: !ruby/object:Gem::Version::Requirement
29
+ version_requirements: !ruby/object:Gem::Requirement
81
30
  requirements:
82
31
  - - ">="
83
32
  - !ruby/object:Gem::Version
84
33
  version: "0.6"
85
34
  version:
35
+ - !ruby/object:Gem::Dependency
36
+ name: rubyforge
37
+ type: :development
38
+ version_requirement:
39
+ version_requirements: !ruby/object:Gem::Requirement
40
+ requirements:
41
+ - - ">="
42
+ - !ruby/object:Gem::Version
43
+ version: 2.0.3
44
+ version:
45
+ - !ruby/object:Gem::Dependency
46
+ name: gemcutter
47
+ type: :development
48
+ version_requirement:
49
+ version_requirements: !ruby/object:Gem::Requirement
50
+ requirements:
51
+ - - ">="
52
+ - !ruby/object:Gem::Version
53
+ version: 0.3.0
54
+ version:
86
55
  - !ruby/object:Gem::Dependency
87
56
  name: hoe
57
+ type: :development
88
58
  version_requirement:
89
- version_requirements: !ruby/object:Gem::Version::Requirement
59
+ version_requirements: !ruby/object:Gem::Requirement
90
60
  requirements:
91
61
  - - ">="
92
62
  - !ruby/object:Gem::Version
93
- version: 1.5.0
63
+ version: 2.5.0
94
64
  version:
65
+ description: |-
66
+ An extensible Ruby wrapper for Atom and RSS parsers.
67
+
68
+ Feed normalizer wraps various RSS and Atom parsers, and returns a single unified
69
+ object graph, regardless of the underlying feed format.
70
+ email: andy@tinnedfruit.org
71
+ executables: []
72
+
73
+ extensions: []
74
+
75
+ extra_rdoc_files:
76
+ - History.txt
77
+ - License.txt
78
+ - Manifest.txt
79
+ - README.txt
80
+ files:
81
+ - History.txt
82
+ - License.txt
83
+ - Manifest.txt
84
+ - Rakefile
85
+ - README.txt
86
+ - lib/feed-normalizer.rb
87
+ - lib/html-cleaner.rb
88
+ - lib/parsers/rss.rb
89
+ - lib/parsers/simple-rss.rb
90
+ - lib/structures.rb
91
+ - test/data/atom03.xml
92
+ - test/data/atom10.xml
93
+ - test/data/rdf10.xml
94
+ - test/data/rss20.xml
95
+ - test/data/rss20diff.xml
96
+ - test/data/rss20diff_short.xml
97
+ - test/test_feednormalizer.rb
98
+ - test/test_htmlcleaner.rb
99
+ has_rdoc: true
100
+ homepage: http://github.com/aasmith/feed-normalizer
101
+ licenses: []
102
+
103
+ post_install_message:
104
+ rdoc_options:
105
+ - --main
106
+ - README.txt
107
+ require_paths:
108
+ - lib
109
+ required_ruby_version: !ruby/object:Gem::Requirement
110
+ requirements:
111
+ - - ">="
112
+ - !ruby/object:Gem::Version
113
+ version: "0"
114
+ version:
115
+ required_rubygems_version: !ruby/object:Gem::Requirement
116
+ requirements:
117
+ - - ">="
118
+ - !ruby/object:Gem::Version
119
+ version: "0"
120
+ version:
121
+ requirements: []
122
+
123
+ rubyforge_project: feed-normalizer
124
+ rubygems_version: 1.3.5
125
+ signing_key:
126
+ specification_version: 3
127
+ summary: Extensible Ruby wrapper for Atom and RSS parsers
128
+ test_files:
129
+ - test/test_feednormalizer.rb
130
+ - test/test_htmlcleaner.rb