openlogic-feed-normalizer 1.5.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,156 @@
1
+ $LOAD_PATH.unshift(File.expand_path(File.join(File.dirname(__FILE__), '../lib')))
2
+ require 'test/unit'
3
+ require 'html-cleaner'
4
+
5
+ include FeedNormalizer
6
+
7
+ class HtmlCleanerTest < Test::Unit::TestCase
8
+
9
+ def test_unescape # Checks unescapeHTML on apostrophe and double-quote entities; &deg; and &#8217; are expected to stay encoded.
10
+ assert_equal "' ' &deg;", FeedNormalizer::HtmlCleaner.unescapeHTML("&apos; &#39; &deg;")
11
+ assert_equal "\" &deg;", FeedNormalizer::HtmlCleaner.unescapeHTML("&quot; &deg;")
12
+ assert_equal "\"\"\"\"", FeedNormalizer::HtmlCleaner.unescapeHTML("&#34;&#000000000000000000034;&#x22;&#x0000022;") # decimal, zero-padded decimal, hex and zero-padded hex spellings of the double quote
13
+ assert_equal "heavily subnet&#8217;d network,", FeedNormalizer::HtmlCleaner.unescapeHTML("heavily subnet&#8217;d network,") # input is expected back unchanged
14
+ end
15
+
16
+ def test_add_entities # Checks add_entities: raw '>' and '&' are escaped, existing entity references are expected to be left alone.
17
+ assert_equal "", HtmlCleaner.add_entities(nil) # nil input yields an empty string
18
+ assert_equal "x &gt; y", HtmlCleaner.add_entities("x > y")
19
+ assert_equal "1 &amp; 2", HtmlCleaner.add_entities("1 & 2")
20
+ assert_equal "&amp; &#123; &acute; &#x123;", HtmlCleaner.add_entities("& &#123; &acute; &#x123;") # only the bare '&' changes
21
+ assert_equal "&amp; &#123; &ACUTE; &#X123A; &#x80f;", HtmlCleaner.add_entities("& &#123; &ACUTE; &#X123A; &#x80f;") # upper-case entity spellings are also left untouched
22
+ assert_equal "heavily subnet&#8217;d network,", HtmlCleaner.add_entities("heavily subnet&#8217;d network,")
23
+ end
24
+
25
+ def test_html_clean # Pins the expected output of HtmlCleaner.clean for benign markup, disallowed markup and historical XSS payloads.
26
+ assert_equal "", HtmlCleaner.clean("")
27
+ # Text content: a bare '>' is escaped and an existing &gt; is not escaped a second time.
28
+ assert_equal "<p>foo &gt; *</p>", HtmlCleaner.clean("<p>foo > *</p>")
29
+ assert_equal "<p>foo &gt; *</p>", HtmlCleaner.clean("<p>foo &gt; *</p>")
30
+ # The foo=bar attribute is dropped; text outside the element is kept.
31
+ assert_equal "<p>para</p>", HtmlCleaner.clean("<p foo=bar>para</p>")
32
+ assert_equal "<p>para</p> outsider", HtmlCleaner.clean("<p foo=bar>para</p> outsider")
33
+ # Stray closing tags (</notvalid>, </body>) are removed.
34
+ assert_equal "<p>para</p>", HtmlCleaner.clean("<p>para</p></notvalid>")
35
+ assert_equal "<p>para</p>", HtmlCleaner.clean("<p>para</p></body>")
36
+ # <plaintext>, <object>/<param> and <iframe> (closed or unclosed) are removed entirely.
37
+ assert_equal "<p>para</p>", HtmlCleaner.clean("<p>para</p><plaintext>")
38
+ assert_equal "<p>para</p>", HtmlCleaner.clean("<p>para</p><object><param></param></object>")
39
+ assert_equal "<p>para</p>", HtmlCleaner.clean("<p>para</p><iframe src='http://evil.example.org'></iframe>")
40
+ assert_equal "<p>para</p>", HtmlCleaner.clean("<p>para</p><iframe src='http://evil.example.org'>")
41
+ # An unknown element is removed together with its text content.
42
+ assert_equal "<p>para</p>", HtmlCleaner.clean("<p>para</p><invalid>invalid</invalid>")
43
+ # Anchor hrefs come back double-quoted, and a raw '&' in the URL is escaped.
44
+ assert_equal "<a href=\"http://example.org\">para</a>", HtmlCleaner.clean("<a href='http://example.org'>para</a>")
45
+ assert_equal "<a href=\"http://example.org/proc?a&amp;b\">para</a>", HtmlCleaner.clean("<a href='http://example.org/proc?a&b'>para</a>")
46
+ # When a <body> tag appears, only the markup after it is expected to survive; malformed look-alikes are escaped as text instead.
47
+ assert_equal "<p>two</p>", HtmlCleaner.clean("<p>para</p><body><p>two</p></body>")
48
+ assert_equal "<p>two</p>", HtmlCleaner.clean("<p>para</p><body><p>two</p>")
49
+ assert_equal "<p>para</p>&lt;bo /dy&gt;<p>two</p>", HtmlCleaner.clean("<p>para</p><bo /dy><p>two</p></body>")
50
+ assert_equal "<p>para</p>&lt;bo\\/dy&gt;<p>two</p>", HtmlCleaner.clean("<p>para</p><bo\\/dy><p>two</p></body>")
51
+ assert_equal "<p>two</p>", HtmlCleaner.clean("<p>para</p><body/><p>two</p></body>")
52
+ # Cleaning already-cleaned output must not double-escape.
53
+ assert_equal "<p>one &amp; two</p>", HtmlCleaner.clean(HtmlCleaner.clean("<p>one & two</p>"))
54
+ # id is kept; the unknown 'ignore' attribute and the onclick handler are dropped.
55
+ assert_equal "<p id=\"p\">para</p>", HtmlCleaner.clean("<p id=\"p\" ignore=\"this\">para</p>")
56
+ assert_equal "<p id=\"p\">para</p>", HtmlCleaner.clean("<p id=\"p\" onclick=\"this\">para</p>")
57
+ # img src is kept for an http URL and dropped for a javascript-style URI written with embedded spaces.
58
+ assert_equal "<img src=\"http://example.org/pic\" />", HtmlCleaner.clean("<img src=\"http://example.org/pic\" />")
59
+ assert_equal "<img />", HtmlCleaner.clean("<img src=\"jav a script:call()\" />")
60
+ # Apostrophe entities are decoded to a literal '; double quotes come out as &quot; whether given raw or already encoded.
61
+ assert_equal "what's new", HtmlCleaner.clean("what&#000039;s new")
62
+ assert_equal "&quot;what's new?&quot;", HtmlCleaner.clean("\"what&apos;s new?\"")
63
+ assert_equal "&quot;what's new?&quot;", HtmlCleaner.clean("&quot;what&apos;s new?&quot;")
64
+
65
+ # Real-world examples from selected feeds
66
+ assert_equal "I have a heavily subnet&#8217;d/vlan&#8217;d network,", HtmlCleaner.clean("I have a heavily subnet&#8217;d/vlan&#8217;d network,")
67
+ # Inside <pre><blockquote>, raw '>' and '"' are escaped and the existing &lt; is preserved.
68
+ assert_equal "<pre><blockquote>&lt;%= start_form_tag :action =&gt; &quot;create&quot; %&gt;</blockquote></pre>",
69
+ HtmlCleaner.clean("<pre><blockquote>&lt;%= start_form_tag :action => \"create\" %></blockquote></pre>")
70
+ # Entity-escaped anchor markup in the input is expected to come back as real <a> elements.
71
+ assert_equal "<a href=\"http://www.mcall.com/news/local/all-smashedmachine1107-cn,0,1574203.story?coll=all-news-hed\">[link]</a><a href=\"http://reddit.com/info/pyhc/comments\">[more]</a>",
72
+ HtmlCleaner.clean("&lt;a href=\"http://www.mcall.com/news/local/all-smashedmachine1107-cn,0,1574203.story?coll=all-news-hed\"&gt;[link]&lt;/a&gt;&lt;a href=\"http://reddit.com/info/pyhc/comments\"&gt;[more]&lt;/a&gt;")
73
+
74
+
75
+ # Various exploits from the past
76
+ assert_equal "", HtmlCleaner.clean("<_img foo=\"<IFRAME width='80%' height='400' src='http://alive.znep.com/~marcs/passport/grabit.html'></IFRAME>\" >")
77
+ assert_equal "<a href=\"https://bugzilla.mozilla.org/attachment.cgi?id=&amp;action=force_internal_error&lt;script&gt;alert(document.cookie)&lt;/script&gt;\">link</a>",
78
+ HtmlCleaner.clean("<a href=\"https://bugzilla.mozilla.org/attachment.cgi?id=&action=force_internal_error<script>alert(document.cookie)</script>\">link</a>")
79
+ assert_equal "<img src=\"doesntexist.jpg\" />", HtmlCleaner.clean("<img src='doesntexist.jpg' onerror='alert(document.cookie)'/>")
80
+ assert_equal "<img src=\"'doesntexist.jpg\" />", HtmlCleaner.clean("<img src=\"'doesntexist.jpg\" onmouseover=\"alert('img-ob-11');''\"/>")
81
+ assert_equal "&lt;IMG &quot;&quot;&quot;&gt;&quot;&gt;", HtmlCleaner.clean("<IMG \"\"\"><SCRIPT>alert(\"XSS\")</SCRIPT>\">")
82
+
83
+ # This doesn't come out as I would like, but the result is still safe.
84
+ # (Apparently, this would work in Gecko.)
85
+ assert HtmlCleaner.clean("<p onclick!\#$%&()*~+-_.,:;?@[/|\\]^=alert(\"XSS\")>para</p>") !~ /\<\>/
86
+ assert_equal "&lt;SCRIPT/XSS SRC=&quot;http://ha.ckers.org/xss.js&quot;&gt;", HtmlCleaner.clean("<SCRIPT/XSS SRC=\"http://ha.ckers.org/xss.js\"></SCRIPT>")
87
+ # IE conditional comments, and the script markup inside them, are removed.
88
+ assert_equal "", HtmlCleaner.clean("<!--[if gte IE 4]><SCRIPT>alert('XSS');</SCRIPT><![endif]-->")
89
+ assert_equal "<p></p>", HtmlCleaner.clean("<p><!--[if gte IE 4]><SCRIPT>alert('XSS');</SCRIPT><![endif]--></p>")
90
+ assert_equal "<p>hi</p><p></p>", HtmlCleaner.clean("<p>hi</p><p><!--[if gte IE 4]><SCRIPT>alert('XSS');</SCRIPT><![endif]--></p>")
91
+ # Ordinary comments are removed and the surrounding text is joined.
92
+ assert_equal "<p>hello</p>", HtmlCleaner.clean("<p>h<!-- hoho -->ell<!-- hoho -->o</p>")
93
+ end
94
+
95
+ def test_html_flatten # Pins the expected output of HtmlCleaner.flatten for plain text, entities, a truncated tag and URLs.
96
+ assert_equal "", HtmlCleaner.flatten("")
97
+ # Plain text passes through; a newline becomes a single space.
98
+ assert_equal "hello", HtmlCleaner.flatten("hello")
99
+ assert_equal "hello world", HtmlCleaner.flatten("hello\nworld")
100
+ # '>' and '"' are escaped; apostrophe entities are decoded.
101
+ assert_equal "A &gt; B : C", HtmlCleaner.flatten("A > B : C")
102
+ assert_equal "what's new", HtmlCleaner.flatten("what&#39;s new")
103
+ assert_equal "&quot;what's new?&quot;", HtmlCleaner.flatten("\"what&apos;s new?\"")
104
+ # A truncated tag is escaped as text; &#8217; stays encoded.
105
+ assert_equal "we&#8217;ve got &lt;a hre", HtmlCleaner.flatten("we&#8217;ve got <a hre")
106
+ # URLs are kept, with a raw '&' escaped.
107
+ assert_equal "http://example.org", HtmlCleaner.flatten("http://example.org")
108
+ assert_equal "http://example.org/proc?a&amp;b", HtmlCleaner.flatten("http://example.org/proc?a&b")
109
+ # Flattening twice gives the same result as flattening once.
110
+ assert_equal "&quot;what's new?&quot;", HtmlCleaner.flatten(HtmlCleaner.flatten("\"what&apos;s new?\""))
111
+ end
112
+
113
+ def test_dodgy_uri # dodgy_uri? must be truthy for script-capable URI schemes under various obfuscations, and nil for nil or ordinary http URLs.
114
+ # All of these javascript urls work in IE6.
115
+ assert HtmlCleaner.dodgy_uri?("javascript:alert('HI');")
116
+ assert HtmlCleaner.dodgy_uri?(" &#106;&#97;&#118;&#97;&#115;&#99;&#114;&#105;&#112;&#116; \n :alert('HI');")
117
+ assert HtmlCleaner.dodgy_uri?("JaVaScRiPt:alert('HI');")
118
+ assert HtmlCleaner.dodgy_uri?("JaV \naSc\nRiPt:alert('HI');")
119
+
120
+ # entities lacking ending ';'
121
+ # This only works if they're all packed together without spacing.
122
+ assert HtmlCleaner.dodgy_uri?("&#106&#97&#118&#97&#115&#99&#114&#105&#112&#116&#58&#97&#108&#101&#114&#116&#40&#39&#105&#109&#103&#45&#111&#98&#45&#50&#39&#41")
123
+ assert HtmlCleaner.dodgy_uri?("&#106&#97&#118&#97&#115&#99&#114&#105&#112&#116&#58&#97&#108&#101&#114&#116&#40&#39&#105&#109&#103&#45&#111&#98&#45&#50&#39 &#41 ; ")
124
+ # catch extra spacing anyway.. support for this is possible, depending where the spaces are.
125
+ assert HtmlCleaner.dodgy_uri?("&#106 &#97 &#118 &#97 &#115 &#99 &#114 &#105 &#112 &#116 &#58 &#97 &#108 &#101 &#114 &#116 &#40 &#39 &#105 &#109 &#103 &#45 &#111 &#98 &#45 &#50 &#39 &#41 ; ")
126
+ assert HtmlCleaner.dodgy_uri?("&#x06a &#97 &#118 &#97 &#115 &#99 &#114 &#105 &#112 &#116 &#58 &#97 &#108 &#101 &#114 &#116 &#40 &#39 &#105 &#109 &#103 &#45 &#111 &#98 &#45 &#50 &#39 &#41 ; ")
127
+ assert HtmlCleaner.dodgy_uri?("&#106avascript")
128
+ assert HtmlCleaner.dodgy_uri?("&#x06a;avascript")
129
+
130
+ # url-encoded
131
+ assert HtmlCleaner.dodgy_uri?("%6A%61%76%61%73%63%72%69%70%74%3A%61%6C%65%72%74%28%27%69%6D%67%2D%6F%62%2D%33%27%29") # percent-encoding of javascript:alert('img-ob-3')
132
+
133
+ # Other evil schemes
134
+ assert HtmlCleaner.dodgy_uri?("vbscript:MsgBox(\"hi\")")
135
+ assert HtmlCleaner.dodgy_uri?("mocha:alert('hi')")
136
+ assert HtmlCleaner.dodgy_uri?("livescript:alert('hi')")
137
+ assert HtmlCleaner.dodgy_uri?("data:text/html;base64,PHNjcmlwdD5hbGVydCgnWFNTJyk8L3NjcmlwdD4K")
138
+
139
+ # Various non-printing chars
140
+ assert HtmlCleaner.dodgy_uri?("javas\0cript:foo()")
141
+ assert HtmlCleaner.dodgy_uri?(" &#14; javascript:foo()")
142
+ assert HtmlCleaner.dodgy_uri?("jav&#x0A;ascript:foo()")
143
+ assert HtmlCleaner.dodgy_uri?("jav&#x09;ascript:foo()")
144
+ assert HtmlCleaner.dodgy_uri?("jav\tascript:foo()")
145
+
146
+ # The Good
147
+ assert_nil HtmlCleaner.dodgy_uri?(nil)
148
+ assert_nil HtmlCleaner.dodgy_uri?("http://example.org")
149
+ assert_nil HtmlCleaner.dodgy_uri?("http://example.org/foo.html")
150
+ assert_nil HtmlCleaner.dodgy_uri?("http://example.org/foo.cgi?x=y&a=b")
151
+ assert_nil HtmlCleaner.dodgy_uri?("http://example.org/foo.cgi?x=y&amp;a=b")
152
+ assert_nil HtmlCleaner.dodgy_uri?("http://example.org/foo.cgi?x=y&#38;a=b")
153
+ assert_nil HtmlCleaner.dodgy_uri?("http://example.org/foo.cgi?x=y&#x56;a=b") # NOTE(review): &#x56; decodes to 'V'; possibly meant &#x26; ('&') to mirror the &#38; case above - confirm
154
+ end
155
+
156
+ end
metadata ADDED
@@ -0,0 +1,123 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: openlogic-feed-normalizer
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.5.3
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Andrew A. Smith
9
+ - Todd Thomas
10
+ autorequire:
11
+ bindir: bin
12
+ cert_chain: []
13
+ date: 2012-02-29 00:00:00.000000000Z
14
+ dependencies:
15
+ - !ruby/object:Gem::Dependency
16
+ name: simple-rss
17
+ requirement: &2152864020 !ruby/object:Gem::Requirement
18
+ none: false
19
+ requirements:
20
+ - - ! '>='
21
+ - !ruby/object:Gem::Version
22
+ version: '1.1'
23
+ type: :runtime
24
+ prerelease: false
25
+ version_requirements: *2152864020
26
+ - !ruby/object:Gem::Dependency
27
+ name: hpricot
28
+ requirement: &2152863500 !ruby/object:Gem::Requirement
29
+ none: false
30
+ requirements:
31
+ - - ! '>='
32
+ - !ruby/object:Gem::Version
33
+ version: '0.6'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: *2152863500
37
+ - !ruby/object:Gem::Dependency
38
+ name: rdoc
39
+ requirement: &2152863020 !ruby/object:Gem::Requirement
40
+ none: false
41
+ requirements:
42
+ - - ~>
43
+ - !ruby/object:Gem::Version
44
+ version: '3.10'
45
+ type: :development
46
+ prerelease: false
47
+ version_requirements: *2152863020
48
+ - !ruby/object:Gem::Dependency
49
+ name: hoe
50
+ requirement: &2152862480 !ruby/object:Gem::Requirement
51
+ none: false
52
+ requirements:
53
+ - - ~>
54
+ - !ruby/object:Gem::Version
55
+ version: '2.14'
56
+ type: :development
57
+ prerelease: false
58
+ version_requirements: *2152862480
59
+ description: ! 'An extensible Ruby wrapper for Atom and RSS parsers.
60
+
61
+
62
+ Feed normalizer wraps various RSS and Atom parsers, and returns a single unified
63
+
64
+ object graph, regardless of the underlying feed format.'
65
+ email:
66
+ - andy@tinnedfruit.org
67
+ - todd.thomas@openlogic.com
68
+ executables: []
69
+ extensions: []
70
+ extra_rdoc_files:
71
+ - History.txt
72
+ - License.txt
73
+ - Manifest.txt
74
+ - README.txt
75
+ files:
76
+ - History.txt
77
+ - License.txt
78
+ - Manifest.txt
79
+ - Rakefile
80
+ - README.txt
81
+ - lib/feed-normalizer.rb
82
+ - lib/html-cleaner.rb
83
+ - lib/parsers/rss.rb
84
+ - lib/parsers/simple-rss.rb
85
+ - lib/structures.rb
86
+ - test/data/atom03.xml
87
+ - test/data/atom10.xml
88
+ - test/data/rdf10.xml
89
+ - test/data/rss20.xml
90
+ - test/data/rss20diff.xml
91
+ - test/data/rss20diff_short.xml
92
+ - test/test_feednormalizer.rb
93
+ - test/test_htmlcleaner.rb
94
+ - .gemtest
95
+ homepage: http://github.com/toddthomas/feed-normalizer
96
+ licenses: []
97
+ post_install_message:
98
+ rdoc_options:
99
+ - --main
100
+ - README.txt
101
+ require_paths:
102
+ - lib
103
+ required_ruby_version: !ruby/object:Gem::Requirement
104
+ none: false
105
+ requirements:
106
+ - - ! '>='
107
+ - !ruby/object:Gem::Version
108
+ version: '0'
109
+ required_rubygems_version: !ruby/object:Gem::Requirement
110
+ none: false
111
+ requirements:
112
+ - - ! '>='
113
+ - !ruby/object:Gem::Version
114
+ version: '0'
115
+ requirements: []
116
+ rubyforge_project: openlogic-feed-normalizer
117
+ rubygems_version: 1.8.15
118
+ signing_key:
119
+ specification_version: 3
120
+ summary: Extensible Ruby wrapper for Atom and RSS parsers
121
+ test_files:
122
+ - test/test_feednormalizer.rb
123
+ - test/test_htmlcleaner.rb