feed-normalizer 1.5.1 → 1.5.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,155 +1,156 @@
1
- require 'test/unit'
2
- require 'html-cleaner'
3
-
4
- include FeedNormalizer
5
-
6
- class HtmlCleanerTest < Test::Unit::TestCase
7
-
8
- def test_unescape
9
- assert_equal "' ' &deg;", FeedNormalizer::HtmlCleaner.unescapeHTML("&apos; &#39; &deg;")
10
- assert_equal "\" &deg;", FeedNormalizer::HtmlCleaner.unescapeHTML("&quot; &deg;")
11
- assert_equal "\"\"\"\"", FeedNormalizer::HtmlCleaner.unescapeHTML("&#34;&#000000000000000000034;&#x22;&#x0000022;")
12
- assert_equal "heavily subnet&#8217;d network,", FeedNormalizer::HtmlCleaner.unescapeHTML("heavily subnet&#8217;d network,")
13
- end
14
-
15
- def test_add_entities
16
- assert_equal "", HtmlCleaner.add_entities(nil)
17
- assert_equal "x &gt; y", HtmlCleaner.add_entities("x > y")
18
- assert_equal "1 &amp; 2", HtmlCleaner.add_entities("1 & 2")
19
- assert_equal "&amp; &#123; &acute; &#x123;", HtmlCleaner.add_entities("& &#123; &acute; &#x123;")
20
- assert_equal "&amp; &#123; &ACUTE; &#X123A; &#x80f;", HtmlCleaner.add_entities("& &#123; &ACUTE; &#X123A; &#x80f;")
21
- assert_equal "heavily subnet&#8217;d network,", HtmlCleaner.add_entities("heavily subnet&#8217;d network,")
22
- end
23
-
24
- def test_html_clean
25
- assert_equal "", HtmlCleaner.clean("")
26
-
27
- assert_equal "<p>foo &gt; *</p>", HtmlCleaner.clean("<p>foo > *</p>")
28
- assert_equal "<p>foo &gt; *</p>", HtmlCleaner.clean("<p>foo &gt; *</p>")
29
-
30
- assert_equal "<p>para</p>", HtmlCleaner.clean("<p foo=bar>para</p>")
31
- assert_equal "<p>para</p> outsider", HtmlCleaner.clean("<p foo=bar>para</p> outsider")
32
-
33
- assert_equal "<p>para</p>", HtmlCleaner.clean("<p>para</p></notvalid>")
34
- assert_equal "<p>para</p>", HtmlCleaner.clean("<p>para</p></body>")
35
-
36
- assert_equal "<p>para</p>", HtmlCleaner.clean("<p>para</p><plaintext>")
37
- assert_equal "<p>para</p>", HtmlCleaner.clean("<p>para</p><object><param></param></object>")
38
- assert_equal "<p>para</p>", HtmlCleaner.clean("<p>para</p><iframe src='http://evil.example.org'></iframe>")
39
- assert_equal "<p>para</p>", HtmlCleaner.clean("<p>para</p><iframe src='http://evil.example.org'>")
40
-
41
- assert_equal "<p>para</p>", HtmlCleaner.clean("<p>para</p><invalid>invalid</invalid>")
42
-
43
- assert_equal "<a href=\"http://example.org\">para</a>", HtmlCleaner.clean("<a href='http://example.org'>para</a>")
44
- assert_equal "<a href=\"http://example.org/proc?a&amp;b\">para</a>", HtmlCleaner.clean("<a href='http://example.org/proc?a&b'>para</a>")
45
-
46
- assert_equal "<p>two</p>", HtmlCleaner.clean("<p>para</p><body><p>two</p></body>")
47
- assert_equal "<p>two</p>", HtmlCleaner.clean("<p>para</p><body><p>two</p>")
48
- assert_equal "<p>para</p>&lt;bo /dy&gt;<p>two</p>", HtmlCleaner.clean("<p>para</p><bo /dy><p>two</p></body>")
49
- assert_equal "<p>para</p>&lt;bo\\/dy&gt;<p>two</p>", HtmlCleaner.clean("<p>para</p><bo\\/dy><p>two</p></body>")
50
- assert_equal "<p>two</p>", HtmlCleaner.clean("<p>para</p><body/><p>two</p></body>")
51
-
52
- assert_equal "<p>one &amp; two</p>", HtmlCleaner.clean(HtmlCleaner.clean("<p>one & two</p>"))
53
-
54
- assert_equal "<p id=\"p\">para</p>", HtmlCleaner.clean("<p id=\"p\" ignore=\"this\">para</p>")
55
- assert_equal "<p id=\"p\">para</p>", HtmlCleaner.clean("<p id=\"p\" onclick=\"this\">para</p>")
56
-
57
- assert_equal "<img src=\"http://example.org/pic\" />", HtmlCleaner.clean("<img src=\"http://example.org/pic\" />")
58
- assert_equal "<img />", HtmlCleaner.clean("<img src=\"jav a script:call()\" />")
59
-
60
- assert_equal "what's new", HtmlCleaner.clean("what&#000039;s new")
61
- assert_equal "&quot;what's new?&quot;", HtmlCleaner.clean("\"what&apos;s new?\"")
62
- assert_equal "&quot;what's new?&quot;", HtmlCleaner.clean("&quot;what&apos;s new?&quot;")
63
-
64
- # Real-world examples from selected feeds
65
- assert_equal "I have a heavily subnet&#8217;d/vlan&#8217;d network,", HtmlCleaner.clean("I have a heavily subnet&#8217;d/vlan&#8217;d network,")
66
-
67
- assert_equal "<pre><blockquote>&lt;%= start_form_tag :action =&gt; &quot;create&quot; %&gt;</blockquote></pre>",
68
- HtmlCleaner.clean("<pre><blockquote>&lt;%= start_form_tag :action => \"create\" %></blockquote></pre>")
69
-
70
- assert_equal "<a href=\"http://www.mcall.com/news/local/all-smashedmachine1107-cn,0,1574203.story?coll=all-news-hed\">[link]</a><a href=\"http://reddit.com/info/pyhc/comments\">[more]</a>",
71
- HtmlCleaner.clean("&lt;a href=\"http://www.mcall.com/news/local/all-smashedmachine1107-cn,0,1574203.story?coll=all-news-hed\"&gt;[link]&lt;/a&gt;&lt;a href=\"http://reddit.com/info/pyhc/comments\"&gt;[more]&lt;/a&gt;")
72
-
73
-
74
- # Various exploits from the past
75
- assert_equal "", HtmlCleaner.clean("<_img foo=\"<IFRAME width='80%' height='400' src='http://alive.znep.com/~marcs/passport/grabit.html'></IFRAME>\" >")
76
- assert_equal "<a href=\"https://bugzilla.mozilla.org/attachment.cgi?id=&amp;action=force_internal_error&lt;script&gt;alert(document.cookie)&lt;/script&gt;\">link</a>",
77
- HtmlCleaner.clean("<a href=\"https://bugzilla.mozilla.org/attachment.cgi?id=&action=force_internal_error<script>alert(document.cookie)</script>\">link</a>")
78
- assert_equal "<img src=\"doesntexist.jpg\" />", HtmlCleaner.clean("<img src='doesntexist.jpg' onerror='alert(document.cookie)'/>")
79
- assert_equal "<img src=\"'doesntexist.jpg\" />", HtmlCleaner.clean("<img src=\"'doesntexist.jpg\" onmouseover=\"alert('img-ob-11');''\"/>")
80
- assert_equal "&lt;IMG &quot;&quot;&quot;&gt;&quot;&gt;", HtmlCleaner.clean("<IMG \"\"\"><SCRIPT>alert(\"XSS\")</SCRIPT>\">")
81
-
82
- # This doesnt come out as I would like, but the result is still safe.
83
- # (Apparently, this would work in Gecko.)
84
- assert HtmlCleaner.clean("<p onclick!\#$%&()*~+-_.,:;?@[/|\\]^=alert(\"XSS\")>para</p>") !~ /\<\>/
85
- assert_equal "&lt;SCRIPT/XSS SRC=&quot;http://ha.ckers.org/xss.js&quot;&gt;", HtmlCleaner.clean("<SCRIPT/XSS SRC=\"http://ha.ckers.org/xss.js\"></SCRIPT>")
86
-
87
- assert_equal "", HtmlCleaner.clean("<!--[if gte IE 4]><SCRIPT>alert('XSS');</SCRIPT><![endif]-->")
88
- assert_equal "<p></p>", HtmlCleaner.clean("<p><!--[if gte IE 4]><SCRIPT>alert('XSS');</SCRIPT><![endif]--></p>")
89
- assert_equal "<p>hi</p><p></p>", HtmlCleaner.clean("<p>hi</p><p><!--[if gte IE 4]><SCRIPT>alert('XSS');</SCRIPT><![endif]--></p>")
90
-
91
- assert_equal "<p>hello</p>", HtmlCleaner.clean("<p>h<!-- hoho -->ell<!-- hoho -->o</p>")
92
- end
93
-
94
- def test_html_flatten
95
- assert_equal "", HtmlCleaner.flatten("")
96
-
97
- assert_equal "hello", HtmlCleaner.flatten("hello")
98
- assert_equal "hello world", HtmlCleaner.flatten("hello\nworld")
99
-
100
- assert_equal "A &gt; B : C", HtmlCleaner.flatten("A > B : C")
101
- assert_equal "what's new", HtmlCleaner.flatten("what&#39;s new")
102
- assert_equal "&quot;what's new?&quot;", HtmlCleaner.flatten("\"what&apos;s new?\"")
103
-
104
- assert_equal "we&#8217;ve got &lt;a hre", HtmlCleaner.flatten("we&#8217;ve got <a hre")
105
-
106
- assert_equal "http://example.org", HtmlCleaner.flatten("http://example.org")
107
- assert_equal "http://example.org/proc?a&amp;b", HtmlCleaner.flatten("http://example.org/proc?a&b")
108
-
109
- assert_equal "&quot;what's new?&quot;", HtmlCleaner.flatten(HtmlCleaner.flatten("\"what&apos;s new?\""))
110
- end
111
-
112
- def test_dodgy_uri
113
- # All of these javascript urls work in IE6.
114
- assert HtmlCleaner.dodgy_uri?("javascript:alert('HI');")
115
- assert HtmlCleaner.dodgy_uri?(" &#106;&#97;&#118;&#97;&#115;&#99;&#114;&#105;&#112;&#116; \n :alert('HI');")
116
- assert HtmlCleaner.dodgy_uri?("JaVaScRiPt:alert('HI');")
117
- assert HtmlCleaner.dodgy_uri?("JaV \naSc\nRiPt:alert('HI');")
118
-
119
- # entities lacking ending ';'
120
- # This only works if they're all packed together without spacing.
121
- assert HtmlCleaner.dodgy_uri?("&#106&#97&#118&#97&#115&#99&#114&#105&#112&#116&#58&#97&#108&#101&#114&#116&#40&#39&#105&#109&#103&#45&#111&#98&#45&#50&#39&#41")
122
- assert HtmlCleaner.dodgy_uri?("&#106&#97&#118&#97&#115&#99&#114&#105&#112&#116&#58&#97&#108&#101&#114&#116&#40&#39&#105&#109&#103&#45&#111&#98&#45&#50&#39 &#41 ; ")
123
- # catch extra spacing anyway.. support for this is possible, depending where the spaces are.
124
- assert HtmlCleaner.dodgy_uri?("&#106 &#97 &#118 &#97 &#115 &#99 &#114 &#105 &#112 &#116 &#58 &#97 &#108 &#101 &#114 &#116 &#40 &#39 &#105 &#109 &#103 &#45 &#111 &#98 &#45 &#50 &#39 &#41 ; ")
125
- assert HtmlCleaner.dodgy_uri?("&#x06a &#97 &#118 &#97 &#115 &#99 &#114 &#105 &#112 &#116 &#58 &#97 &#108 &#101 &#114 &#116 &#40 &#39 &#105 &#109 &#103 &#45 &#111 &#98 &#45 &#50 &#39 &#41 ; ")
126
- assert HtmlCleaner.dodgy_uri?("&#106avascript")
127
- assert HtmlCleaner.dodgy_uri?("&#x06a;avascript")
128
-
129
- # url-encoded
130
- assert HtmlCleaner.dodgy_uri?("%6A%61%76%61%73%63%72%69%70%74%3A%61%6C%65%72%74%28%27%69%6D%67%2D%6F%62%2D%33%27%29")
131
-
132
- # Other evil schemes
133
- assert HtmlCleaner.dodgy_uri?("vbscript:MsgBox(\"hi\")")
134
- assert HtmlCleaner.dodgy_uri?("mocha:alert('hi')")
135
- assert HtmlCleaner.dodgy_uri?("livescript:alert('hi')")
136
- assert HtmlCleaner.dodgy_uri?("data:text/html;base64,PHNjcmlwdD5hbGVydCgnWFNTJyk8L3NjcmlwdD4K")
137
-
138
- # Various non-printing chars
139
- assert HtmlCleaner.dodgy_uri?("javas\0cript:foo()")
140
- assert HtmlCleaner.dodgy_uri?(" &#14; javascript:foo()")
141
- assert HtmlCleaner.dodgy_uri?("jav&#x0A;ascript:foo()")
142
- assert HtmlCleaner.dodgy_uri?("jav&#x09;ascript:foo()")
143
- assert HtmlCleaner.dodgy_uri?("jav\tascript:foo()")
144
-
145
- # The Good
146
- assert_nil HtmlCleaner.dodgy_uri?(nil)
147
- assert_nil HtmlCleaner.dodgy_uri?("http://example.org")
148
- assert_nil HtmlCleaner.dodgy_uri?("http://example.org/foo.html")
149
- assert_nil HtmlCleaner.dodgy_uri?("http://example.org/foo.cgi?x=y&a=b")
150
- assert_nil HtmlCleaner.dodgy_uri?("http://example.org/foo.cgi?x=y&amp;a=b")
151
- assert_nil HtmlCleaner.dodgy_uri?("http://example.org/foo.cgi?x=y&#38;a=b")
152
- assert_nil HtmlCleaner.dodgy_uri?("http://example.org/foo.cgi?x=y&#x56;a=b")
153
- end
154
-
155
- end
1
+ $LOAD_PATH.unshift(File.expand_path(File.join(File.dirname(__FILE__), '../lib')))
2
+ require 'test/unit'
3
+ require 'html-cleaner'
4
+
5
+ include FeedNormalizer
6
+
7
+ class HtmlCleanerTest < Test::Unit::TestCase
8
+
9
+ def test_unescape
10
+ assert_equal "' ' &deg;", FeedNormalizer::HtmlCleaner.unescapeHTML("&apos; &#39; &deg;")
11
+ assert_equal "\" &deg;", FeedNormalizer::HtmlCleaner.unescapeHTML("&quot; &deg;")
12
+ assert_equal "\"\"\"\"", FeedNormalizer::HtmlCleaner.unescapeHTML("&#34;&#000000000000000000034;&#x22;&#x0000022;")
13
+ assert_equal "heavily subnet&#8217;d network,", FeedNormalizer::HtmlCleaner.unescapeHTML("heavily subnet&#8217;d network,")
14
+ end
15
+
16
+ def test_add_entities
17
+ assert_equal "", HtmlCleaner.add_entities(nil)
18
+ assert_equal "x &gt; y", HtmlCleaner.add_entities("x > y")
19
+ assert_equal "1 &amp; 2", HtmlCleaner.add_entities("1 & 2")
20
+ assert_equal "&amp; &#123; &acute; &#x123;", HtmlCleaner.add_entities("& &#123; &acute; &#x123;")
21
+ assert_equal "&amp; &#123; &ACUTE; &#X123A; &#x80f;", HtmlCleaner.add_entities("& &#123; &ACUTE; &#X123A; &#x80f;")
22
+ assert_equal "heavily subnet&#8217;d network,", HtmlCleaner.add_entities("heavily subnet&#8217;d network,")
23
+ end
24
+
25
+ def test_html_clean
26
+ assert_equal "", HtmlCleaner.clean("")
27
+
28
+ assert_equal "<p>foo &gt; *</p>", HtmlCleaner.clean("<p>foo > *</p>")
29
+ assert_equal "<p>foo &gt; *</p>", HtmlCleaner.clean("<p>foo &gt; *</p>")
30
+
31
+ assert_equal "<p>para</p>", HtmlCleaner.clean("<p foo=bar>para</p>")
32
+ assert_equal "<p>para</p> outsider", HtmlCleaner.clean("<p foo=bar>para</p> outsider")
33
+
34
+ assert_equal "<p>para</p>", HtmlCleaner.clean("<p>para</p></notvalid>")
35
+ assert_equal "<p>para</p>", HtmlCleaner.clean("<p>para</p></body>")
36
+
37
+ assert_equal "<p>para</p>", HtmlCleaner.clean("<p>para</p><plaintext>")
38
+ assert_equal "<p>para</p>", HtmlCleaner.clean("<p>para</p><object><param></param></object>")
39
+ assert_equal "<p>para</p>", HtmlCleaner.clean("<p>para</p><iframe src='http://evil.example.org'></iframe>")
40
+ assert_equal "<p>para</p>", HtmlCleaner.clean("<p>para</p><iframe src='http://evil.example.org'>")
41
+
42
+ assert_equal "<p>para</p>", HtmlCleaner.clean("<p>para</p><invalid>invalid</invalid>")
43
+
44
+ assert_equal "<a href=\"http://example.org\">para</a>", HtmlCleaner.clean("<a href='http://example.org'>para</a>")
45
+ assert_equal "<a href=\"http://example.org/proc?a&amp;b\">para</a>", HtmlCleaner.clean("<a href='http://example.org/proc?a&b'>para</a>")
46
+
47
+ assert_equal "<p>two</p>", HtmlCleaner.clean("<p>para</p><body><p>two</p></body>")
48
+ assert_equal "<p>two</p>", HtmlCleaner.clean("<p>para</p><body><p>two</p>")
49
+ assert_equal "<p>para</p>&lt;bo /dy&gt;<p>two</p>", HtmlCleaner.clean("<p>para</p><bo /dy><p>two</p></body>")
50
+ assert_equal "<p>para</p>&lt;bo\\/dy&gt;<p>two</p>", HtmlCleaner.clean("<p>para</p><bo\\/dy><p>two</p></body>")
51
+ assert_equal "<p>two</p>", HtmlCleaner.clean("<p>para</p><body/><p>two</p></body>")
52
+
53
+ assert_equal "<p>one &amp; two</p>", HtmlCleaner.clean(HtmlCleaner.clean("<p>one & two</p>"))
54
+
55
+ assert_equal "<p id=\"p\">para</p>", HtmlCleaner.clean("<p id=\"p\" ignore=\"this\">para</p>")
56
+ assert_equal "<p id=\"p\">para</p>", HtmlCleaner.clean("<p id=\"p\" onclick=\"this\">para</p>")
57
+
58
+ assert_equal "<img src=\"http://example.org/pic\" />", HtmlCleaner.clean("<img src=\"http://example.org/pic\" />")
59
+ assert_equal "<img />", HtmlCleaner.clean("<img src=\"jav a script:call()\" />")
60
+
61
+ assert_equal "what's new", HtmlCleaner.clean("what&#000039;s new")
62
+ assert_equal "&quot;what's new?&quot;", HtmlCleaner.clean("\"what&apos;s new?\"")
63
+ assert_equal "&quot;what's new?&quot;", HtmlCleaner.clean("&quot;what&apos;s new?&quot;")
64
+
65
+ # Real-world examples from selected feeds
66
+ assert_equal "I have a heavily subnet&#8217;d/vlan&#8217;d network,", HtmlCleaner.clean("I have a heavily subnet&#8217;d/vlan&#8217;d network,")
67
+
68
+ assert_equal "<pre><blockquote>&lt;%= start_form_tag :action =&gt; &quot;create&quot; %&gt;</blockquote></pre>",
69
+ HtmlCleaner.clean("<pre><blockquote>&lt;%= start_form_tag :action => \"create\" %></blockquote></pre>")
70
+
71
+ assert_equal "<a href=\"http://www.mcall.com/news/local/all-smashedmachine1107-cn,0,1574203.story?coll=all-news-hed\">[link]</a><a href=\"http://reddit.com/info/pyhc/comments\">[more]</a>",
72
+ HtmlCleaner.clean("&lt;a href=\"http://www.mcall.com/news/local/all-smashedmachine1107-cn,0,1574203.story?coll=all-news-hed\"&gt;[link]&lt;/a&gt;&lt;a href=\"http://reddit.com/info/pyhc/comments\"&gt;[more]&lt;/a&gt;")
73
+
74
+
75
+ # Various exploits from the past
76
+ assert_equal "", HtmlCleaner.clean("<_img foo=\"<IFRAME width='80%' height='400' src='http://alive.znep.com/~marcs/passport/grabit.html'></IFRAME>\" >")
77
+ assert_equal "<a href=\"https://bugzilla.mozilla.org/attachment.cgi?id=&amp;action=force_internal_error&lt;script&gt;alert(document.cookie)&lt;/script&gt;\">link</a>",
78
+ HtmlCleaner.clean("<a href=\"https://bugzilla.mozilla.org/attachment.cgi?id=&action=force_internal_error<script>alert(document.cookie)</script>\">link</a>")
79
+ assert_equal "<img src=\"doesntexist.jpg\" />", HtmlCleaner.clean("<img src='doesntexist.jpg' onerror='alert(document.cookie)'/>")
80
+ assert_equal "<img src=\"'doesntexist.jpg\" />", HtmlCleaner.clean("<img src=\"'doesntexist.jpg\" onmouseover=\"alert('img-ob-11');''\"/>")
81
+ assert_equal "&lt;IMG &quot;&quot;&quot;&gt;&quot;&gt;", HtmlCleaner.clean("<IMG \"\"\"><SCRIPT>alert(\"XSS\")</SCRIPT>\">")
82
+
83
+ # This doesnt come out as I would like, but the result is still safe.
84
+ # (Apparently, this would work in Gecko.)
85
+ assert HtmlCleaner.clean("<p onclick!\#$%&()*~+-_.,:;?@[/|\\]^=alert(\"XSS\")>para</p>") !~ /\<\>/
86
+ assert_equal "&lt;SCRIPT/XSS SRC=&quot;http://ha.ckers.org/xss.js&quot;&gt;", HtmlCleaner.clean("<SCRIPT/XSS SRC=\"http://ha.ckers.org/xss.js\"></SCRIPT>")
87
+
88
+ assert_equal "", HtmlCleaner.clean("<!--[if gte IE 4]><SCRIPT>alert('XSS');</SCRIPT><![endif]-->")
89
+ assert_equal "<p></p>", HtmlCleaner.clean("<p><!--[if gte IE 4]><SCRIPT>alert('XSS');</SCRIPT><![endif]--></p>")
90
+ assert_equal "<p>hi</p><p></p>", HtmlCleaner.clean("<p>hi</p><p><!--[if gte IE 4]><SCRIPT>alert('XSS');</SCRIPT><![endif]--></p>")
91
+
92
+ assert_equal "<p>hello</p>", HtmlCleaner.clean("<p>h<!-- hoho -->ell<!-- hoho -->o</p>")
93
+ end
94
+
95
+ def test_html_flatten
96
+ assert_equal "", HtmlCleaner.flatten("")
97
+
98
+ assert_equal "hello", HtmlCleaner.flatten("hello")
99
+ assert_equal "hello world", HtmlCleaner.flatten("hello\nworld")
100
+
101
+ assert_equal "A &gt; B : C", HtmlCleaner.flatten("A > B : C")
102
+ assert_equal "what's new", HtmlCleaner.flatten("what&#39;s new")
103
+ assert_equal "&quot;what's new?&quot;", HtmlCleaner.flatten("\"what&apos;s new?\"")
104
+
105
+ assert_equal "we&#8217;ve got &lt;a hre", HtmlCleaner.flatten("we&#8217;ve got <a hre")
106
+
107
+ assert_equal "http://example.org", HtmlCleaner.flatten("http://example.org")
108
+ assert_equal "http://example.org/proc?a&amp;b", HtmlCleaner.flatten("http://example.org/proc?a&b")
109
+
110
+ assert_equal "&quot;what's new?&quot;", HtmlCleaner.flatten(HtmlCleaner.flatten("\"what&apos;s new?\""))
111
+ end
112
+
113
+ def test_dodgy_uri
114
+ # All of these javascript urls work in IE6.
115
+ assert HtmlCleaner.dodgy_uri?("javascript:alert('HI');")
116
+ assert HtmlCleaner.dodgy_uri?(" &#106;&#97;&#118;&#97;&#115;&#99;&#114;&#105;&#112;&#116; \n :alert('HI');")
117
+ assert HtmlCleaner.dodgy_uri?("JaVaScRiPt:alert('HI');")
118
+ assert HtmlCleaner.dodgy_uri?("JaV \naSc\nRiPt:alert('HI');")
119
+
120
+ # entities lacking ending ';'
121
+ # This only works if they're all packed together without spacing.
122
+ assert HtmlCleaner.dodgy_uri?("&#106&#97&#118&#97&#115&#99&#114&#105&#112&#116&#58&#97&#108&#101&#114&#116&#40&#39&#105&#109&#103&#45&#111&#98&#45&#50&#39&#41")
123
+ assert HtmlCleaner.dodgy_uri?("&#106&#97&#118&#97&#115&#99&#114&#105&#112&#116&#58&#97&#108&#101&#114&#116&#40&#39&#105&#109&#103&#45&#111&#98&#45&#50&#39 &#41 ; ")
124
+ # catch extra spacing anyway.. support for this is possible, depending where the spaces are.
125
+ assert HtmlCleaner.dodgy_uri?("&#106 &#97 &#118 &#97 &#115 &#99 &#114 &#105 &#112 &#116 &#58 &#97 &#108 &#101 &#114 &#116 &#40 &#39 &#105 &#109 &#103 &#45 &#111 &#98 &#45 &#50 &#39 &#41 ; ")
126
+ assert HtmlCleaner.dodgy_uri?("&#x06a &#97 &#118 &#97 &#115 &#99 &#114 &#105 &#112 &#116 &#58 &#97 &#108 &#101 &#114 &#116 &#40 &#39 &#105 &#109 &#103 &#45 &#111 &#98 &#45 &#50 &#39 &#41 ; ")
127
+ assert HtmlCleaner.dodgy_uri?("&#106avascript")
128
+ assert HtmlCleaner.dodgy_uri?("&#x06a;avascript")
129
+
130
+ # url-encoded
131
+ assert HtmlCleaner.dodgy_uri?("%6A%61%76%61%73%63%72%69%70%74%3A%61%6C%65%72%74%28%27%69%6D%67%2D%6F%62%2D%33%27%29")
132
+
133
+ # Other evil schemes
134
+ assert HtmlCleaner.dodgy_uri?("vbscript:MsgBox(\"hi\")")
135
+ assert HtmlCleaner.dodgy_uri?("mocha:alert('hi')")
136
+ assert HtmlCleaner.dodgy_uri?("livescript:alert('hi')")
137
+ assert HtmlCleaner.dodgy_uri?("data:text/html;base64,PHNjcmlwdD5hbGVydCgnWFNTJyk8L3NjcmlwdD4K")
138
+
139
+ # Various non-printing chars
140
+ assert HtmlCleaner.dodgy_uri?("javas\0cript:foo()")
141
+ assert HtmlCleaner.dodgy_uri?(" &#14; javascript:foo()")
142
+ assert HtmlCleaner.dodgy_uri?("jav&#x0A;ascript:foo()")
143
+ assert HtmlCleaner.dodgy_uri?("jav&#x09;ascript:foo()")
144
+ assert HtmlCleaner.dodgy_uri?("jav\tascript:foo()")
145
+
146
+ # The Good
147
+ assert_nil HtmlCleaner.dodgy_uri?(nil)
148
+ assert_nil HtmlCleaner.dodgy_uri?("http://example.org")
149
+ assert_nil HtmlCleaner.dodgy_uri?("http://example.org/foo.html")
150
+ assert_nil HtmlCleaner.dodgy_uri?("http://example.org/foo.cgi?x=y&a=b")
151
+ assert_nil HtmlCleaner.dodgy_uri?("http://example.org/foo.cgi?x=y&amp;a=b")
152
+ assert_nil HtmlCleaner.dodgy_uri?("http://example.org/foo.cgi?x=y&#38;a=b")
153
+ assert_nil HtmlCleaner.dodgy_uri?("http://example.org/foo.cgi?x=y&#x56;a=b")
154
+ end
155
+
156
+ end
metadata CHANGED
@@ -1,74 +1,22 @@
1
1
  --- !ruby/object:Gem::Specification
2
- rubygems_version: 0.9.4
3
- specification_version: 1
4
2
  name: feed-normalizer
5
3
  version: !ruby/object:Gem::Version
6
- version: 1.5.1
7
- date: 2008-02-06 00:00:00 -08:00
8
- summary: Extensible Ruby wrapper for Atom and RSS parsers
9
- require_paths:
10
- - lib
11
- email: andy@tinnedfruit.org
12
- homepage: http://feed-normalizer.rubyforge.org/
13
- rubyforge_project: feed-normalizer
14
- description: An extensible Ruby wrapper for Atom and RSS parsers. Feed normalizer wraps various RSS and Atom parsers, and returns a single unified object graph, regardless of the underlying feed format.
15
- autorequire:
16
- default_executable:
17
- bindir: bin
18
- has_rdoc: true
19
- required_ruby_version: !ruby/object:Gem::Version::Requirement
20
- requirements:
21
- - - ">"
22
- - !ruby/object:Gem::Version
23
- version: 0.0.0
24
- version:
4
+ version: 1.5.2
25
5
  platform: ruby
26
- signing_key:
27
- cert_chain:
28
- post_install_message:
29
6
  authors:
30
7
  - Andrew A. Smith
31
- files:
32
- - History.txt
33
- - License.txt
34
- - Manifest.txt
35
- - Rakefile
36
- - README.txt
37
- - lib/feed-normalizer.rb
38
- - lib/html-cleaner.rb
39
- - lib/parsers/rss.rb
40
- - lib/parsers/simple-rss.rb
41
- - lib/structures.rb
42
- - test/data/atom03.xml
43
- - test/data/atom10.xml
44
- - test/data/rdf10.xml
45
- - test/data/rss20.xml
46
- - test/data/rss20diff.xml
47
- - test/data/rss20diff_short.xml
48
- - test/test_all.rb
49
- - test/test_feednormalizer.rb
50
- - test/test_htmlcleaner.rb
51
- test_files:
52
- - test/test_all.rb
53
- rdoc_options:
54
- - --main
55
- - README.txt
56
- extra_rdoc_files:
57
- - History.txt
58
- - License.txt
59
- - Manifest.txt
60
- - README.txt
61
- executables: []
62
-
63
- extensions: []
64
-
65
- requirements: []
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
66
11
 
12
+ date: 2010-01-25 00:00:00 -08:00
13
+ default_executable:
67
14
  dependencies:
68
15
  - !ruby/object:Gem::Dependency
69
16
  name: simple-rss
17
+ type: :runtime
70
18
  version_requirement:
71
- version_requirements: !ruby/object:Gem::Version::Requirement
19
+ version_requirements: !ruby/object:Gem::Requirement
72
20
  requirements:
73
21
  - - ">="
74
22
  - !ruby/object:Gem::Version
@@ -76,19 +24,107 @@ dependencies:
76
24
  version:
77
25
  - !ruby/object:Gem::Dependency
78
26
  name: hpricot
27
+ type: :runtime
79
28
  version_requirement:
80
- version_requirements: !ruby/object:Gem::Version::Requirement
29
+ version_requirements: !ruby/object:Gem::Requirement
81
30
  requirements:
82
31
  - - ">="
83
32
  - !ruby/object:Gem::Version
84
33
  version: "0.6"
85
34
  version:
35
+ - !ruby/object:Gem::Dependency
36
+ name: rubyforge
37
+ type: :development
38
+ version_requirement:
39
+ version_requirements: !ruby/object:Gem::Requirement
40
+ requirements:
41
+ - - ">="
42
+ - !ruby/object:Gem::Version
43
+ version: 2.0.3
44
+ version:
45
+ - !ruby/object:Gem::Dependency
46
+ name: gemcutter
47
+ type: :development
48
+ version_requirement:
49
+ version_requirements: !ruby/object:Gem::Requirement
50
+ requirements:
51
+ - - ">="
52
+ - !ruby/object:Gem::Version
53
+ version: 0.3.0
54
+ version:
86
55
  - !ruby/object:Gem::Dependency
87
56
  name: hoe
57
+ type: :development
88
58
  version_requirement:
89
- version_requirements: !ruby/object:Gem::Version::Requirement
59
+ version_requirements: !ruby/object:Gem::Requirement
90
60
  requirements:
91
61
  - - ">="
92
62
  - !ruby/object:Gem::Version
93
- version: 1.5.0
63
+ version: 2.5.0
94
64
  version:
65
+ description: |-
66
+ An extensible Ruby wrapper for Atom and RSS parsers.
67
+
68
+ Feed normalizer wraps various RSS and Atom parsers, and returns a single unified
69
+ object graph, regardless of the underlying feed format.
70
+ email: andy@tinnedfruit.org
71
+ executables: []
72
+
73
+ extensions: []
74
+
75
+ extra_rdoc_files:
76
+ - History.txt
77
+ - License.txt
78
+ - Manifest.txt
79
+ - README.txt
80
+ files:
81
+ - History.txt
82
+ - License.txt
83
+ - Manifest.txt
84
+ - Rakefile
85
+ - README.txt
86
+ - lib/feed-normalizer.rb
87
+ - lib/html-cleaner.rb
88
+ - lib/parsers/rss.rb
89
+ - lib/parsers/simple-rss.rb
90
+ - lib/structures.rb
91
+ - test/data/atom03.xml
92
+ - test/data/atom10.xml
93
+ - test/data/rdf10.xml
94
+ - test/data/rss20.xml
95
+ - test/data/rss20diff.xml
96
+ - test/data/rss20diff_short.xml
97
+ - test/test_feednormalizer.rb
98
+ - test/test_htmlcleaner.rb
99
+ has_rdoc: true
100
+ homepage: http://github.com/aasmith/feed-normalizer
101
+ licenses: []
102
+
103
+ post_install_message:
104
+ rdoc_options:
105
+ - --main
106
+ - README.txt
107
+ require_paths:
108
+ - lib
109
+ required_ruby_version: !ruby/object:Gem::Requirement
110
+ requirements:
111
+ - - ">="
112
+ - !ruby/object:Gem::Version
113
+ version: "0"
114
+ version:
115
+ required_rubygems_version: !ruby/object:Gem::Requirement
116
+ requirements:
117
+ - - ">="
118
+ - !ruby/object:Gem::Version
119
+ version: "0"
120
+ version:
121
+ requirements: []
122
+
123
+ rubyforge_project: feed-normalizer
124
+ rubygems_version: 1.3.5
125
+ signing_key:
126
+ specification_version: 3
127
+ summary: Extensible Ruby wrapper for Atom and RSS parsers
128
+ test_files:
129
+ - test/test_feednormalizer.rb
130
+ - test/test_htmlcleaner.rb