feed-normalizer 1.5.1 → 1.5.2
Sign up to get free protection for your applications and to get access to all the features.
- data/History.txt +48 -48
- data/License.txt +27 -27
- data/Manifest.txt +18 -19
- data/README.txt +63 -63
- data/Rakefile +29 -25
- data/lib/feed-normalizer.rb +149 -149
- data/lib/html-cleaner.rb +181 -190
- data/lib/parsers/rss.rb +110 -95
- data/lib/parsers/simple-rss.rb +138 -137
- data/lib/structures.rb +245 -244
- data/test/data/atom03.xml +128 -127
- data/test/data/atom10.xml +114 -112
- data/test/data/rdf10.xml +1498 -1498
- data/test/data/rss20.xml +64 -63
- data/test/data/rss20diff.xml +59 -59
- data/test/data/rss20diff_short.xml +51 -51
- data/test/test_feednormalizer.rb +265 -267
- data/test/test_htmlcleaner.rb +156 -155
- metadata +99 -63
- data/test/test_all.rb +0 -6
data/test/data/rss20.xml
CHANGED
@@ -1,63 +1,64 @@
|
|
1
|
-
<?xml version="1.0" encoding="ISO-8859-1" ?>
|
2
|
-
<?xml-stylesheet title="XSL_formatting" type="text/xsl" href="/shared/bsp/xsl/rss/nolsol.xsl"?>
|
3
|
-
<rss version="2.0" xmlns:content="http://purl.org/rss/1.0/modules/content/">
|
4
|
-
<channel>
|
5
|
-
<title>BBC News | Technology | UK Edition</title>
|
6
|
-
<link>http://news.bbc.co.uk/go/rss/-/1/hi/technology/default.stm</link>
|
7
|
-
<description>Visit BBC News for up-to-the-minute news, breaking news, video, audio and feature stories. BBC News provides trusted World and UK news as well as local and regional perspectives. Also entertainment, business, science, technology and health news.</description>
|
8
|
-
<language>en-gb</language>
|
9
|
-
<lastBuildDate>Sat, 09 Sep 2006 14:57:06 GMT</lastBuildDate>
|
10
|
-
<copyright>Copyright: (C) British Broadcasting Corporation, see http://news.bbc.co.uk/1/hi/help/rss/4498287.stm for terms and conditions of reuse</copyright>
|
11
|
-
<docs>http://www.bbc.co.uk/syndication/</docs>
|
12
|
-
<ttl>15</ttl>
|
13
|
-
<skipHours>
|
14
|
-
<hour>6</hour>
|
15
|
-
<hour>7</hour>
|
16
|
-
<hour>8</hour>
|
17
|
-
<hour>9</hour>
|
18
|
-
<hour>10</hour>
|
19
|
-
<hour>11</hour>
|
20
|
-
</skipHours>
|
21
|
-
<skipDays>
|
22
|
-
<day>Sunday</day>
|
23
|
-
</skipDays>
|
24
|
-
|
25
|
-
<image>
|
26
|
-
<title>BBC News</title>
|
27
|
-
<url>http://news.bbc.co.uk/nol/shared/img/bbc_news_120x60.gif</url>
|
28
|
-
<link>http://news.bbc.co.uk/go/rss/-/1/hi/technology/default.stm</link>
|
29
|
-
</image>
|
30
|
-
|
31
|
-
<item>
|
32
|
-
<title>Concerns over security software</title>
|
33
|
-
<description><![CDATA[BBC Click investigates free security software and finds out who will protect PCs when Microsoft launches Vista.]]></description>
|
34
|
-
<content:encoded
|
35
|
-
<link>http://news.bbc.co.uk/go/rss/-/1/hi/programmes/click_online/5326654.stm</link>
|
36
|
-
<guid isPermaLink="false">http://news.bbc.co.uk/1/hi/programmes/click_online/5326654.stm</guid>
|
37
|
-
<pubDate>Sat, 09 Sep 2006 12:45:35 GMT</pubDate>
|
38
|
-
<category>Click</category>
|
39
|
-
</item>
|
40
|
-
|
41
|
-
<item>
|
42
|
-
<title>Top prize for 'light' inventor</title>
|
43
|
-
<description>A Japanese scientist who invented a sustainable form of light is awarded the Millennium Technology Prize.</description>
|
44
|
-
<content:encoded><![CDATA[<p>test2</p>]]></content:encoded>
|
45
|
-
<link>http://news.bbc.co.uk/go/rss/-/1/hi/technology/5328446.stm</link>
|
46
|
-
<
|
47
|
-
<
|
48
|
-
<
|
49
|
-
<category>
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
<
|
55
|
-
<
|
56
|
-
<
|
57
|
-
<
|
58
|
-
<
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
</
|
63
|
-
|
1
|
+
<?xml version="1.0" encoding="ISO-8859-1" ?>
|
2
|
+
<?xml-stylesheet title="XSL_formatting" type="text/xsl" href="/shared/bsp/xsl/rss/nolsol.xsl"?>
|
3
|
+
<rss version="2.0" xmlns:content="http://purl.org/rss/1.0/modules/content/">
|
4
|
+
<channel>
|
5
|
+
<title>BBC News | Technology | UK Edition</title>
|
6
|
+
<link>http://news.bbc.co.uk/go/rss/-/1/hi/technology/default.stm</link>
|
7
|
+
<description>Visit BBC News for up-to-the-minute news, breaking news, video, audio and feature stories. BBC News provides trusted World and UK news as well as local and regional perspectives. Also entertainment, business, science, technology and health news.</description>
|
8
|
+
<language>en-gb</language>
|
9
|
+
<lastBuildDate>Sat, 09 Sep 2006 14:57:06 GMT</lastBuildDate>
|
10
|
+
<copyright>Copyright: (C) British Broadcasting Corporation, see http://news.bbc.co.uk/1/hi/help/rss/4498287.stm for terms and conditions of reuse</copyright>
|
11
|
+
<docs>http://www.bbc.co.uk/syndication/</docs>
|
12
|
+
<ttl>15</ttl>
|
13
|
+
<skipHours>
|
14
|
+
<hour>6</hour>
|
15
|
+
<hour>7</hour>
|
16
|
+
<hour>8</hour>
|
17
|
+
<hour>9</hour>
|
18
|
+
<hour>10</hour>
|
19
|
+
<hour>11</hour>
|
20
|
+
</skipHours>
|
21
|
+
<skipDays>
|
22
|
+
<day>Sunday</day>
|
23
|
+
</skipDays>
|
24
|
+
|
25
|
+
<image>
|
26
|
+
<title>BBC News</title>
|
27
|
+
<url>http://news.bbc.co.uk/nol/shared/img/bbc_news_120x60.gif</url>
|
28
|
+
<link>http://news.bbc.co.uk/go/rss/-/1/hi/technology/default.stm</link>
|
29
|
+
</image>
|
30
|
+
|
31
|
+
<item>
|
32
|
+
<title>Concerns over security software</title>
|
33
|
+
<description><![CDATA[BBC Click investigates free security software and finds out who will protect PCs when Microsoft launches Vista.]]></description>
|
34
|
+
<content:encoded> <![CDATA[<p>test1</p>]]> </content:encoded>
|
35
|
+
<link>http://news.bbc.co.uk/go/rss/-/1/hi/programmes/click_online/5326654.stm</link>
|
36
|
+
<guid isPermaLink="false">http://news.bbc.co.uk/1/hi/programmes/click_online/5326654.stm</guid>
|
37
|
+
<pubDate>Sat, 09 Sep 2006 12:45:35 GMT</pubDate>
|
38
|
+
<category>Click</category>
|
39
|
+
</item>
|
40
|
+
|
41
|
+
<item>
|
42
|
+
<title>Top prize for 'light' inventor</title>
|
43
|
+
<description>A Japanese scientist who invented a sustainable form of light is awarded the Millennium Technology Prize.</description>
|
44
|
+
<content:encoded><![CDATA[<p>test2</p>]]></content:encoded>
|
45
|
+
<link>http://news.bbc.co.uk/go/rss/-/1/hi/technology/5328446.stm</link>
|
46
|
+
<enclosure url="http://websrvr60ny.audiovideoweb.com/ny60web16519/LTN/POA/POA_042905.mp3" length="12619776" type="audio/mpeg"></enclosure>
|
47
|
+
<guid isPermaLink="false">http://news.bbc.co.uk/1/hi/technology/5328446.stm</guid>
|
48
|
+
<pubDate>Fri, 08 Sep 2006 16:18:08 GMT</pubDate>
|
49
|
+
<category>Technology</category>
|
50
|
+
<category>Japan</category>
|
51
|
+
</item>
|
52
|
+
|
53
|
+
<item>
|
54
|
+
<title>MP3 player court order overturned</title>
|
55
|
+
<description><b>SanDisk</b> puts its MP3 players back on display at a German electronics show after overturning a court injunction.</description>
|
56
|
+
<content:encoded><![CDATA[<p>test3</p>]]></content:encoded>
|
57
|
+
<link>http://news.bbc.co.uk/go/rss/-/1/hi/technology/5326660.stm</link>
|
58
|
+
<guid isPermaLink="false">http://news.bbc.co.uk/1/hi/technology/5326660.stm</guid>
|
59
|
+
<pubDate>Fri, 08 Sep 2006 10:14:41 GMT</pubDate>
|
60
|
+
</item>
|
61
|
+
|
62
|
+
</channel>
|
63
|
+
</rss>
|
64
|
+
|
data/test/data/rss20diff.xml
CHANGED
@@ -1,59 +1,59 @@
|
|
1
|
-
<?xml version="1.0" encoding="ISO-8859-1" ?>
|
2
|
-
<?xml-stylesheet title="XSL_formatting" type="text/xsl" href="/shared/bsp/xsl/rss/nolsol.xsl"?>
|
3
|
-
<rss version="2.0">
|
4
|
-
<channel>
|
5
|
-
<title>diff</title>
|
6
|
-
<link>http://news.bbc.co.uk/go/rss/-/1/hi/technology/default.stm</link>
|
7
|
-
<description>Visit BBC News for up-to-the-minute news, breaking news, video, audio and feature stories. BBC News provides trusted World and UK news as well as local and regional perspectives. Also entertainment, business, science, technology and health news.</description>
|
8
|
-
<language>en-gb</language>
|
9
|
-
<lastBuildDate>Sat, 09 Sep 2006 14:57:06 GMT</lastBuildDate>
|
10
|
-
<copyright>Copyright: (C) British Broadcasting Corporation, see http://news.bbc.co.uk/1/hi/help/rss/4498287.stm for terms and conditions of reuse</copyright>
|
11
|
-
<docs>http://www.bbc.co.uk/syndication/</docs>
|
12
|
-
<ttl>15</ttl>
|
13
|
-
<skipHours>
|
14
|
-
<hour>6</hour>
|
15
|
-
<hour>7</hour>
|
16
|
-
<hour>8</hour>
|
17
|
-
<hour>9</hour>
|
18
|
-
<hour>10</hour>
|
19
|
-
<hour>11</hour>
|
20
|
-
</skipHours>
|
21
|
-
<skipDays>
|
22
|
-
<day>Sunday</day>
|
23
|
-
</skipDays>
|
24
|
-
|
25
|
-
<image>
|
26
|
-
<title>BBC News</title>
|
27
|
-
<url>http://news.bbc.co.uk/nol/shared/img/bbc_news_120x60.gif</url>
|
28
|
-
<link>http://news.bbc.co.uk/go/rss/-/1/hi/technology/default.stm</link>
|
29
|
-
</image>
|
30
|
-
|
31
|
-
<item>
|
32
|
-
<title>diff</title>
|
33
|
-
<description>BBC Click investigates free security software and finds out who will protect PCs when Microsoft launches Vista.</description>
|
34
|
-
<link>http://news.bbc.co.uk/go/rss/-/1/hi/programmes/click_online/5326654.stm</link>
|
35
|
-
<guid isPermaLink="false">http://news.bbc.co.uk/1/hi/programmes/click_online/5326654.stm</guid>
|
36
|
-
<pubDate>Sat, 09 Sep 2006 12:45:35 GMT</pubDate>
|
37
|
-
<category>Click</category>
|
38
|
-
</item>
|
39
|
-
|
40
|
-
<item>
|
41
|
-
<title>diff</title>
|
42
|
-
<description>diff</description>
|
43
|
-
<link>diff</link>
|
44
|
-
<guid isPermaLink="false">http://news.bbc.co.uk/1/hi/technology/5328446.stm</guid>
|
45
|
-
<pubDate>Fri, 08 Sep 2006 16:18:08 GMT</pubDate>
|
46
|
-
<category>diff</category>
|
47
|
-
</item>
|
48
|
-
|
49
|
-
<item>
|
50
|
-
<title>MP3 player court order overturned</title>
|
51
|
-
<description>SanDisk puts its MP3 players back on display at a German electronics show after overturning a court injunction.</description>
|
52
|
-
<link>http://news.bbc.co.uk/go/rss/-/1/hi/technology/5326660.stm</link>
|
53
|
-
<guid isPermaLink="false">http://news.bbc.co.uk/1/hi/technology/5326660.stm</guid>
|
54
|
-
<pubDate>Fri, 08 Sep 2006 10:14:41 GMT</pubDate>
|
55
|
-
</item>
|
56
|
-
|
57
|
-
</channel>
|
58
|
-
</rss>
|
59
|
-
|
1
|
+
<?xml version="1.0" encoding="ISO-8859-1" ?>
|
2
|
+
<?xml-stylesheet title="XSL_formatting" type="text/xsl" href="/shared/bsp/xsl/rss/nolsol.xsl"?>
|
3
|
+
<rss version="2.0">
|
4
|
+
<channel>
|
5
|
+
<title>diff</title>
|
6
|
+
<link>http://news.bbc.co.uk/go/rss/-/1/hi/technology/default.stm</link>
|
7
|
+
<description>Visit BBC News for up-to-the-minute news, breaking news, video, audio and feature stories. BBC News provides trusted World and UK news as well as local and regional perspectives. Also entertainment, business, science, technology and health news.</description>
|
8
|
+
<language>en-gb</language>
|
9
|
+
<lastBuildDate>Sat, 09 Sep 2006 14:57:06 GMT</lastBuildDate>
|
10
|
+
<copyright>Copyright: (C) British Broadcasting Corporation, see http://news.bbc.co.uk/1/hi/help/rss/4498287.stm for terms and conditions of reuse</copyright>
|
11
|
+
<docs>http://www.bbc.co.uk/syndication/</docs>
|
12
|
+
<ttl>15</ttl>
|
13
|
+
<skipHours>
|
14
|
+
<hour>6</hour>
|
15
|
+
<hour>7</hour>
|
16
|
+
<hour>8</hour>
|
17
|
+
<hour>9</hour>
|
18
|
+
<hour>10</hour>
|
19
|
+
<hour>11</hour>
|
20
|
+
</skipHours>
|
21
|
+
<skipDays>
|
22
|
+
<day>Sunday</day>
|
23
|
+
</skipDays>
|
24
|
+
|
25
|
+
<image>
|
26
|
+
<title>BBC News</title>
|
27
|
+
<url>http://news.bbc.co.uk/nol/shared/img/bbc_news_120x60.gif</url>
|
28
|
+
<link>http://news.bbc.co.uk/go/rss/-/1/hi/technology/default.stm</link>
|
29
|
+
</image>
|
30
|
+
|
31
|
+
<item>
|
32
|
+
<title>diff</title>
|
33
|
+
<description>BBC Click investigates free security software and finds out who will protect PCs when Microsoft launches Vista.</description>
|
34
|
+
<link>http://news.bbc.co.uk/go/rss/-/1/hi/programmes/click_online/5326654.stm</link>
|
35
|
+
<guid isPermaLink="false">http://news.bbc.co.uk/1/hi/programmes/click_online/5326654.stm</guid>
|
36
|
+
<pubDate>Sat, 09 Sep 2006 12:45:35 GMT</pubDate>
|
37
|
+
<category>Click</category>
|
38
|
+
</item>
|
39
|
+
|
40
|
+
<item>
|
41
|
+
<title>diff</title>
|
42
|
+
<description>diff</description>
|
43
|
+
<link>diff</link>
|
44
|
+
<guid isPermaLink="false">http://news.bbc.co.uk/1/hi/technology/5328446.stm</guid>
|
45
|
+
<pubDate>Fri, 08 Sep 2006 16:18:08 GMT</pubDate>
|
46
|
+
<category>diff</category>
|
47
|
+
</item>
|
48
|
+
|
49
|
+
<item>
|
50
|
+
<title>MP3 player court order overturned</title>
|
51
|
+
<description>SanDisk puts its MP3 players back on display at a German electronics show after overturning a court injunction.</description>
|
52
|
+
<link>http://news.bbc.co.uk/go/rss/-/1/hi/technology/5326660.stm</link>
|
53
|
+
<guid isPermaLink="false">http://news.bbc.co.uk/1/hi/technology/5326660.stm</guid>
|
54
|
+
<pubDate>Fri, 08 Sep 2006 10:14:41 GMT</pubDate>
|
55
|
+
</item>
|
56
|
+
|
57
|
+
</channel>
|
58
|
+
</rss>
|
59
|
+
|
data/test/data/rss20diff_short.xml
CHANGED
@@ -1,51 +1,51 @@
|
|
1
|
-
<?xml version="1.0" encoding="ISO-8859-1" ?>
|
2
|
-
<?xml-stylesheet title="XSL_formatting" type="text/xsl" href="/shared/bsp/xsl/rss/nolsol.xsl"?>
|
3
|
-
<rss version="2.0">
|
4
|
-
<channel>
|
5
|
-
<title>diff</title>
|
6
|
-
<link>http://news.bbc.co.uk/go/rss/-/1/hi/technology/default.stm</link>
|
7
|
-
<description>Visit BBC News for up-to-the-minute news, breaking news, video, audio and feature stories. BBC News provides trusted World and UK news as well as local and regional perspectives. Also entertainment, business, science, technology and health news.</description>
|
8
|
-
<language>en-gb</language>
|
9
|
-
<lastBuildDate>Sat, 09 Sep 2006 14:57:06 GMT</lastBuildDate>
|
10
|
-
<copyright>Copyright: (C) British Broadcasting Corporation, see http://news.bbc.co.uk/1/hi/help/rss/4498287.stm for terms and conditions of reuse</copyright>
|
11
|
-
<docs>http://www.bbc.co.uk/syndication/</docs>
|
12
|
-
<ttl>15</ttl>
|
13
|
-
<skipHours>
|
14
|
-
<hour>6</hour>
|
15
|
-
<hour>7</hour>
|
16
|
-
<hour>8</hour>
|
17
|
-
<hour>9</hour>
|
18
|
-
<hour>10</hour>
|
19
|
-
<hour>11</hour>
|
20
|
-
</skipHours>
|
21
|
-
<skipDays>
|
22
|
-
<day>Sunday</day>
|
23
|
-
</skipDays>
|
24
|
-
|
25
|
-
<image>
|
26
|
-
<title>BBC News</title>
|
27
|
-
<url>http://news.bbc.co.uk/nol/shared/img/bbc_news_120x60.gif</url>
|
28
|
-
<link>http://news.bbc.co.uk/go/rss/-/1/hi/technology/default.stm</link>
|
29
|
-
</image>
|
30
|
-
|
31
|
-
<item>
|
32
|
-
<title>diff</title>
|
33
|
-
<description>BBC Click investigates free security software and finds out who will protect PCs when Microsoft launches Vista.</description>
|
34
|
-
<link>http://news.bbc.co.uk/go/rss/-/1/hi/programmes/click_online/5326654.stm</link>
|
35
|
-
<guid isPermaLink="false">http://news.bbc.co.uk/1/hi/programmes/click_online/5326654.stm</guid>
|
36
|
-
<pubDate>Sat, 09 Sep 2006 12:45:35 GMT</pubDate>
|
37
|
-
<category>Click</category>
|
38
|
-
</item>
|
39
|
-
|
40
|
-
<item>
|
41
|
-
<title>diff</title>
|
42
|
-
<description>A Japanese scientist who invented a sustainable form of light is awarded the Millennium Technology Prize.</description>
|
43
|
-
<link>diff</link>
|
44
|
-
<guid isPermaLink="false">http://news.bbc.co.uk/1/hi/technology/5328446.stm</guid>
|
45
|
-
<pubDate>Fri, 08 Sep 2006 16:18:08 GMT</pubDate>
|
46
|
-
<category>diff</category>
|
47
|
-
</item>
|
48
|
-
|
49
|
-
</channel>
|
50
|
-
</rss>
|
51
|
-
|
1
|
+
<?xml version="1.0" encoding="ISO-8859-1" ?>
|
2
|
+
<?xml-stylesheet title="XSL_formatting" type="text/xsl" href="/shared/bsp/xsl/rss/nolsol.xsl"?>
|
3
|
+
<rss version="2.0">
|
4
|
+
<channel>
|
5
|
+
<title>diff</title>
|
6
|
+
<link>http://news.bbc.co.uk/go/rss/-/1/hi/technology/default.stm</link>
|
7
|
+
<description>Visit BBC News for up-to-the-minute news, breaking news, video, audio and feature stories. BBC News provides trusted World and UK news as well as local and regional perspectives. Also entertainment, business, science, technology and health news.</description>
|
8
|
+
<language>en-gb</language>
|
9
|
+
<lastBuildDate>Sat, 09 Sep 2006 14:57:06 GMT</lastBuildDate>
|
10
|
+
<copyright>Copyright: (C) British Broadcasting Corporation, see http://news.bbc.co.uk/1/hi/help/rss/4498287.stm for terms and conditions of reuse</copyright>
|
11
|
+
<docs>http://www.bbc.co.uk/syndication/</docs>
|
12
|
+
<ttl>15</ttl>
|
13
|
+
<skipHours>
|
14
|
+
<hour>6</hour>
|
15
|
+
<hour>7</hour>
|
16
|
+
<hour>8</hour>
|
17
|
+
<hour>9</hour>
|
18
|
+
<hour>10</hour>
|
19
|
+
<hour>11</hour>
|
20
|
+
</skipHours>
|
21
|
+
<skipDays>
|
22
|
+
<day>Sunday</day>
|
23
|
+
</skipDays>
|
24
|
+
|
25
|
+
<image>
|
26
|
+
<title>BBC News</title>
|
27
|
+
<url>http://news.bbc.co.uk/nol/shared/img/bbc_news_120x60.gif</url>
|
28
|
+
<link>http://news.bbc.co.uk/go/rss/-/1/hi/technology/default.stm</link>
|
29
|
+
</image>
|
30
|
+
|
31
|
+
<item>
|
32
|
+
<title>diff</title>
|
33
|
+
<description>BBC Click investigates free security software and finds out who will protect PCs when Microsoft launches Vista.</description>
|
34
|
+
<link>http://news.bbc.co.uk/go/rss/-/1/hi/programmes/click_online/5326654.stm</link>
|
35
|
+
<guid isPermaLink="false">http://news.bbc.co.uk/1/hi/programmes/click_online/5326654.stm</guid>
|
36
|
+
<pubDate>Sat, 09 Sep 2006 12:45:35 GMT</pubDate>
|
37
|
+
<category>Click</category>
|
38
|
+
</item>
|
39
|
+
|
40
|
+
<item>
|
41
|
+
<title>diff</title>
|
42
|
+
<description>A Japanese scientist who invented a sustainable form of light is awarded the Millennium Technology Prize.</description>
|
43
|
+
<link>diff</link>
|
44
|
+
<guid isPermaLink="false">http://news.bbc.co.uk/1/hi/technology/5328446.stm</guid>
|
45
|
+
<pubDate>Fri, 08 Sep 2006 16:18:08 GMT</pubDate>
|
46
|
+
<category>diff</category>
|
47
|
+
</item>
|
48
|
+
|
49
|
+
</channel>
|
50
|
+
</rss>
|
51
|
+
|
data/test/test_feednormalizer.rb
CHANGED
@@ -1,267 +1,265 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
assert_kind_of Feed, FeedNormalizer::FeedNormalizer.parse(XML_FILES[:
|
37
|
-
:force_parser => RubyRssParser, :try_others => false)
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
assert_kind_of Feed, FeedNormalizer::FeedNormalizer.parse(XML_FILES[:
|
44
|
-
:force_parser => SimpleRssParser, :try_others => false)
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
assert_equal
|
70
|
-
assert_equal [
|
71
|
-
assert_equal
|
72
|
-
assert_equal
|
73
|
-
assert_equal
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
assert_equal
|
84
|
-
assert_equal
|
85
|
-
assert_equal
|
86
|
-
assert_equal
|
87
|
-
assert_equal
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
feed
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
assert_not_equal FeedNormalizer::FeedNormalizer.parse(XML_FILES[:
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
feed
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
feed.
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
|
188
|
-
|
189
|
-
|
190
|
-
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
|
195
|
-
|
196
|
-
|
197
|
-
feed
|
198
|
-
|
199
|
-
|
200
|
-
|
201
|
-
|
202
|
-
|
203
|
-
|
204
|
-
|
205
|
-
feed
|
206
|
-
|
207
|
-
|
208
|
-
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
|
213
|
-
|
214
|
-
|
215
|
-
|
216
|
-
|
217
|
-
|
218
|
-
|
219
|
-
|
220
|
-
|
221
|
-
|
222
|
-
Time.
|
223
|
-
|
224
|
-
|
225
|
-
|
226
|
-
|
227
|
-
|
228
|
-
|
229
|
-
|
230
|
-
|
231
|
-
|
232
|
-
|
233
|
-
|
234
|
-
|
235
|
-
|
236
|
-
|
237
|
-
|
238
|
-
|
239
|
-
|
240
|
-
|
241
|
-
|
242
|
-
|
243
|
-
|
244
|
-
|
245
|
-
|
246
|
-
|
247
|
-
|
248
|
-
|
249
|
-
|
250
|
-
|
251
|
-
|
252
|
-
|
253
|
-
|
254
|
-
|
255
|
-
|
256
|
-
|
257
|
-
|
258
|
-
|
259
|
-
|
260
|
-
|
261
|
-
feed
|
262
|
-
|
263
|
-
|
264
|
-
|
265
|
-
|
266
|
-
end
|
267
|
-
|
1
|
+
$LOAD_PATH.unshift(File.expand_path(File.join(File.dirname(__FILE__), '../lib')))
|
2
|
+
require 'test/unit'
|
3
|
+
require 'feed-normalizer'
|
4
|
+
|
5
|
+
class FeedNormalizerTest < Test::Unit::TestCase
|
6
|
+
|
7
|
+
XML_FILES = {}
|
8
|
+
|
9
|
+
Fn = FeedNormalizer
|
10
|
+
|
11
|
+
data_dir = File.dirname(__FILE__) + '/data'
|
12
|
+
|
13
|
+
# Load up the xml files
|
14
|
+
Dir.open(data_dir).each do |fn|
|
15
|
+
next unless fn =~ /[.]xml$/
|
16
|
+
XML_FILES[fn.scan(/(.*)[.]/).to_s.to_sym] = File.read(data_dir + "/#{fn}")
|
17
|
+
end
|
18
|
+
|
19
|
+
def test_basic_parse
|
20
|
+
assert_kind_of Fn::Feed, FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rss20])
|
21
|
+
end
|
22
|
+
|
23
|
+
def test_force_parser
|
24
|
+
assert_kind_of Fn::Feed, FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rss20],
|
25
|
+
:force_parser => Fn::RubyRssParser, :try_others => true)
|
26
|
+
end
|
27
|
+
|
28
|
+
def test_force_parser_exclusive
|
29
|
+
assert_kind_of Fn::Feed, FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rss20],
|
30
|
+
:force_parser => Fn::RubyRssParser, :try_others => false)
|
31
|
+
end
|
32
|
+
|
33
|
+
def test_ruby_rss_parser
|
34
|
+
assert_kind_of Fn::Feed, FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rss20],
|
35
|
+
:force_parser => Fn::RubyRssParser, :try_others => false)
|
36
|
+
assert_kind_of Fn::Feed, FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rdf10],
|
37
|
+
:force_parser => Fn::RubyRssParser, :try_others => false)
|
38
|
+
end
|
39
|
+
|
40
|
+
def test_simple_rss_parser
|
41
|
+
assert_kind_of Fn::Feed, FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rss20],
|
42
|
+
:force_parser => Fn::SimpleRssParser, :try_others => false)
|
43
|
+
assert_kind_of Fn::Feed, FeedNormalizer::FeedNormalizer.parse(XML_FILES[:atom10],
|
44
|
+
:force_parser => Fn::SimpleRssParser, :try_others => false)
|
45
|
+
end
|
46
|
+
|
47
|
+
def test_parser_failover_order
|
48
|
+
assert_equal 'SimpleRSS', FeedNormalizer::FeedNormalizer.parse(XML_FILES[:atom10], :force_parser => Fn::RubyRssParser).parser
|
49
|
+
end
|
50
|
+
|
51
|
+
def test_force_parser_fail
|
52
|
+
assert_nil FeedNormalizer::FeedNormalizer.parse(XML_FILES[:atom10], :force_parser => Fn::RubyRssParser, :try_others => false)
|
53
|
+
end
|
54
|
+
|
55
|
+
def test_all_parsers_fail
|
56
|
+
assert_nil FeedNormalizer::FeedNormalizer.parse("This isn't RSS or Atom!")
|
57
|
+
end
|
58
|
+
|
59
|
+
def test_correct_parser_used
|
60
|
+
assert_equal 'RSS::Parser', FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rss20]).parser
|
61
|
+
assert_equal 'SimpleRSS', FeedNormalizer::FeedNormalizer.parse(XML_FILES[:atom10]).parser
|
62
|
+
end
|
63
|
+
|
64
|
+
def test_rss
|
65
|
+
feed = FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rss20])
|
66
|
+
|
67
|
+
assert_equal "BBC News | Technology | UK Edition", feed.title
|
68
|
+
assert_equal ["http://news.bbc.co.uk/go/rss/-/1/hi/technology/default.stm"], feed.urls
|
69
|
+
assert_equal 15, feed.ttl
|
70
|
+
assert_equal [6, 7, 8, 9, 10, 11], feed.skip_hours
|
71
|
+
assert_equal ["Sunday"], feed.skip_days
|
72
|
+
assert_equal "MP3 player court order overturned", feed.entries.last.title
|
73
|
+
assert_equal "<b>SanDisk</b> puts its MP3 players back on display at a German electronics show after overturning a court injunction.", feed.entries.last.description
|
74
|
+
assert_match(/test\d/, feed.entries.last.content)
|
75
|
+
assert_instance_of Time, feed.entries.last.date_published
|
76
|
+
end
|
77
|
+
|
78
|
+
def test_simplerss
|
79
|
+
feed = FeedNormalizer::FeedNormalizer.parse(XML_FILES[:atom10])
|
80
|
+
|
81
|
+
assert_equal "~:caboose", feed.title
|
82
|
+
assert_equal "http://habtm.com/xml/atom10/feed.xml", feed.url
|
83
|
+
assert_equal nil, feed.ttl
|
84
|
+
assert_equal [], feed.skip_hours
|
85
|
+
assert_equal [], feed.skip_days
|
86
|
+
assert_equal "Starfish - Easy Distribution of Site Maintenance", feed.entries.last.title
|
87
|
+
assert_equal "urn:uuid:6c028f36-f87a-4f53-b7e3-1f943d2341f0", feed.entries.last.id
|
88
|
+
|
89
|
+
assert !feed.entries.last.description.include?("google fame")
|
90
|
+
assert feed.entries.last.content.include?("google fame")
|
91
|
+
end
|
92
|
+
|
93
|
+
def test_sanity_check
|
94
|
+
XML_FILES.keys.each do |xml_file|
|
95
|
+
feed = FeedNormalizer::FeedNormalizer.parse(XML_FILES[xml_file])
|
96
|
+
|
97
|
+
assert [feed.parser, feed.title, feed.url, feed.entries.first.url].collect{|e| e.is_a?(String)}.all?, "Not everything was a String in #{xml_file}"
|
98
|
+
end
|
99
|
+
end
|
100
|
+
|
101
|
+
def test_feed_equality
|
102
|
+
assert_equal FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rss20]), FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rss20])
|
103
|
+
assert_equal FeedNormalizer::FeedNormalizer.parse(XML_FILES[:atom10]), FeedNormalizer::FeedNormalizer.parse(XML_FILES[:atom10])
|
104
|
+
assert_not_equal FeedNormalizer::FeedNormalizer.parse(XML_FILES[:atom03]), FeedNormalizer::FeedNormalizer.parse(XML_FILES[:atom10])
|
105
|
+
assert_not_equal FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rss20]), FeedNormalizer::FeedNormalizer.parse(XML_FILES[:atom10])
|
106
|
+
assert_not_equal FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rss20]), FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rss20diff])
|
107
|
+
end
|
108
|
+
|
109
|
+
def test_feed_diff
|
110
|
+
feed = FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rss20])
|
111
|
+
|
112
|
+
diff = feed.diff(FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rss20diff]))
|
113
|
+
diff_short = feed.diff(FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rss20diff_short]))
|
114
|
+
no_diff = feed.diff(feed)
|
115
|
+
|
116
|
+
assert diff.keys.all? {|key| [:title, :items].include?(key)}
|
117
|
+
assert_equal 3, diff[:items].size
|
118
|
+
|
119
|
+
assert diff_short.keys.all? {|key| [:title, :items].include?(key)}
|
120
|
+
assert_equal [3,2], diff_short[:items]
|
121
|
+
|
122
|
+
assert no_diff.empty?
|
123
|
+
end
|
124
|
+
|
125
|
+
def test_marshal
|
126
|
+
feed = FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rss20])
|
127
|
+
|
128
|
+
assert_nothing_raised { Marshal.load(Marshal.dump(feed)) }
|
129
|
+
end
|
130
|
+
|
131
|
+
def test_yaml
|
132
|
+
feed = FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rss20])
|
133
|
+
assert_nothing_raised { YAML.load(YAML.dump(feed)) }
|
134
|
+
end
|
135
|
+
|
136
|
+
def test_method_missing
|
137
|
+
assert_raise(NoMethodError) { Fn::Feed.new(nil).nonexistant }
|
138
|
+
end
|
139
|
+
|
140
|
+
def test_clean
|
141
|
+
feed = FeedNormalizer::FeedNormalizer.parse(XML_FILES[:atom10])
|
142
|
+
|
143
|
+
assert_match(/<plaintext>/, feed.entries.first.content)
|
144
|
+
assert_match(/<plaintext>/, feed.entries.first.description)
|
145
|
+
feed.clean!
|
146
|
+
assert_no_match(/<plaintext>/, feed.entries.first.content)
|
147
|
+
assert_no_match(/<plaintext>/, feed.entries.first.description)
|
148
|
+
end
|
149
|
+
|
150
|
+
def test_malformed_feed
|
151
|
+
assert_nothing_raised { FeedNormalizer::FeedNormalizer.parse('<feed></feed>') }
|
152
|
+
end
|
153
|
+
|
154
|
+
def test_dublin_core_date_ruby_rss
|
155
|
+
feed = FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rdf10], :force_parser => Fn::RubyRssParser, :try_others => false)
|
156
|
+
assert_instance_of Time, feed.entries.first.date_published
|
157
|
+
end
|
158
|
+
|
159
|
+
def test_dublin_core_date_simple_rss
|
160
|
+
feed = FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rdf10], :force_parser => Fn::SimpleRssParser, :try_others => false)
|
161
|
+
assert_instance_of Time, feed.entries.first.date_published
|
162
|
+
end
|
163
|
+
|
164
|
+
def test_dublin_core_creator_ruby_rss
|
165
|
+
feed = FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rdf10], :force_parser => Fn::RubyRssParser, :try_others => false)
|
166
|
+
assert_equal 'Jeff Hecht', feed.entries.last.author
|
167
|
+
end
|
168
|
+
|
169
|
+
def test_dublin_core_creator_simple_rss
|
170
|
+
feed = FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rdf10], :force_parser => Fn::SimpleRssParser, :try_others => false)
|
171
|
+
assert_equal 'Jeff Hecht', feed.entries.last.author
|
172
|
+
end
|
173
|
+
|
174
|
+
def test_entry_categories_ruby_rss
|
175
|
+
feed = FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rss20], :force_parser => Fn::RubyRssParser, :try_others => false)
|
176
|
+
assert_equal [['Click'],['Technology'],[]], feed.items.collect {|i|i.categories}
|
177
|
+
end
|
178
|
+
|
179
|
+
def test_entry_categories_simple_rss
|
180
|
+
feed = FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rss20], :force_parser => Fn::SimpleRssParser, :try_others => false)
|
181
|
+
assert_equal [['Click'],['Technology'],[]], feed.items.collect {|i|i.categories}
|
182
|
+
end
|
183
|
+
|
184
|
+
def test_loose_categories_ruby_rss
|
185
|
+
feed = FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rss20], :force_parser => Fn::RubyRssParser, :try_others => false, :loose => true)
|
186
|
+
assert_equal [1,2,0], feed.entries.collect{|e|e.categories.size}
|
187
|
+
end
|
188
|
+
|
189
|
+
def test_loose_categories_simple_rss
|
190
|
+
feed = FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rss20], :force_parser => Fn::SimpleRssParser, :try_others => false, :loose => true)
|
191
|
+
assert_equal [1,1,0], feed.entries.collect{|e|e.categories.size}
|
192
|
+
end
|
193
|
+
|
194
|
+
def test_content_encoded_simple_rss
|
195
|
+
feed = FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rss20], :force_parser => Fn::SimpleRssParser, :try_others => false)
|
196
|
+
|
197
|
+
feed.entries.each_with_index do |e, i|
|
198
|
+
assert_match(/\s*<p>test#{i+1}<\/p>\s*/, e.content)
|
199
|
+
end
|
200
|
+
end
|
201
|
+
|
202
|
+
def test_content_encoded_ruby_rss
|
203
|
+
feed = FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rss20], :force_parser => Fn::RubyRssParser, :try_others => false)
|
204
|
+
|
205
|
+
feed.entries.each_with_index do |e, i|
|
206
|
+
assert_match(/\s*<p>test#{i+1}<\/p>\s*/, e.content)
|
207
|
+
end
|
208
|
+
end
|
209
|
+
|
210
|
+
def test_atom_content_contains_pluses
|
211
|
+
feed = FeedNormalizer::FeedNormalizer.parse(XML_FILES[:atom10], :force_parser => Fn::SimpleRssParser, :try_others => false)
|
212
|
+
|
213
|
+
assert_equal 2, feed.entries.last.content.scan(/\+/).size
|
214
|
+
end
|
215
|
+
|
216
|
+
# http://code.google.com/p/feed-normalizer/issues/detail?id=13
|
217
|
+
def test_times_are_reparsed
|
218
|
+
feed = FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rss20], :force_parser => Fn::RubyRssParser, :try_others => false)
|
219
|
+
|
220
|
+
Time.class_eval "alias :old_to_s :to_s; def to_s(x=1); old_to_s; end"
|
221
|
+
|
222
|
+
assert_equal Time.parse("Sat Sep 09 10:57:06 -0400 2006").to_s, feed.last_updated.to_s(:foo)
|
223
|
+
assert_equal Time.parse("Sat Sep 09 08:45:35 -0400 2006").to_s, feed.entries.first.date_published.to_s(:foo)
|
224
|
+
end
|
225
|
+
|
226
|
+
def test_atom03_has_issued
|
227
|
+
SimpleRSS.class_eval "@@item_tags.delete(:issued)"
|
228
|
+
feed = FeedNormalizer::FeedNormalizer.parse(XML_FILES[:atom03], :force_parser => Fn::SimpleRssParser, :try_others => false)
|
229
|
+
assert_nil feed.entries.first.date_published
|
230
|
+
|
231
|
+
SimpleRSS.class_eval "@@item_tags << :issued"
|
232
|
+
feed = FeedNormalizer::FeedNormalizer.parse(XML_FILES[:atom03], :force_parser => Fn::SimpleRssParser, :try_others => false)
|
233
|
+
assert_equal "Tue Aug 29 02:31:03 UTC 2006", feed.entries.first.date_published.to_s
|
234
|
+
end
|
235
|
+
|
236
|
+
def test_html_should_be_escaped_by_default
|
237
|
+
feed = FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rss20], :force_parser => Fn::RubyRssParser, :try_others => false)
|
238
|
+
assert_match "<b>SanDisk</b>", feed.items.last.description
|
239
|
+
|
240
|
+
feed = FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rss20], :force_parser => Fn::SimpleRssParser, :try_others => false)
|
241
|
+
assert_match "<b>SanDisk</b>", feed.items.last.description
|
242
|
+
end
|
243
|
+
|
244
|
+
def test_relative_links_and_images_should_be_rewritten_with_url_base
|
245
|
+
feed = FeedNormalizer::FeedNormalizer.parse(XML_FILES[:atom03])
|
246
|
+
assert_match '<a href="http://www.cheapstingybargains.com/link/tplclick?lid=41000000011334249&pubid=21000000000053626"' +
|
247
|
+
' target=_"blank"><img src="http://www.cheapstingybargains.com/assets/images/product/productDetail/9990000058546711.jpg"' +
|
248
|
+
' width="150" height="150" border="0" style="float: right; margin: 0px 0px 5px 5px;" /></a>',
|
249
|
+
feed.items.first.content
|
250
|
+
end
|
251
|
+
|
252
|
+
def test_last_updated_simple_rss
|
253
|
+
feed = FeedNormalizer::FeedNormalizer.parse(XML_FILES[:atom10], :force_parser => Fn::SimpleRssParser, :try_others => false)
|
254
|
+
|
255
|
+
assert_equal Time.parse("Wed Aug 16 09:59:44 -0700 2006"), feed.entries.first.last_updated
|
256
|
+
end
|
257
|
+
|
258
|
+
def test_last_updated_ruby_rss
|
259
|
+
feed = FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rss20], :force_parser => Fn::RubyRssParser, :try_others => false)
|
260
|
+
|
261
|
+
assert_equal feed.entries.first.date_published, feed.entries.first.last_updated
|
262
|
+
end
|
263
|
+
|
264
|
+
end
|
265
|
+
|