feed-normalizer 1.1.0 → 1.2.0

Sign up to get free protection for your applications and to get access to all the features.
data/History.txt ADDED
@@ -0,0 +1,14 @@
1
+
2
+ 1.2.0
3
+
4
+ * Added HtmlCleaner - sanitizes HTML and removes 'bad' URIs to a level suitable
5
+ for 'safe' display inside a web browser. Can be used as a standalone library,
6
+ or as part of the Feed object. See Feed.clean! for details about cleaning a
7
+ Feed instance. Also see HtmlCleaner and its unit tests. Uses Hpricot.
8
+ * Added Feed-diffing. Differences between two feeds can be displayed using
9
+ Feed.diff. Works nicely with YAML for a readable diff.
10
+ * FeedNormalizer.parse now takes a hash for its arguments.
11
+ * Removed FN::Content.
12
+ * Now uses Hoe!
13
+
14
+
File without changes
data/Manifest.txt ADDED
@@ -0,0 +1,19 @@
1
+ History.txt
2
+ License.txt
3
+ Manifest.txt
4
+ Rakefile
5
+ Readme.txt
6
+ lib/feed-normalizer.rb
7
+ lib/html-cleaner.rb
8
+ lib/parsers/rss.rb
9
+ lib/parsers/simple-rss.rb
10
+ lib/structures.rb
11
+ test/data/atom03.xml
12
+ test/data/atom10.xml
13
+ test/data/rdf10.xml
14
+ test/data/rss20.xml
15
+ test/data/rss20diff.xml
16
+ test/data/rss20diff_short.xml
17
+ test/test_all.rb
18
+ test/test_feednormalizer.rb
19
+ test/test_htmlcleaner.rb
data/Rakefile CHANGED
@@ -1,49 +1,25 @@
1
- require 'rubygems'
2
- require 'rake'
3
- require 'rake/testtask'
4
- require 'rake/rdoctask'
5
- require 'rake/clean'
6
- require 'rake/gempackagetask'
1
+ require 'hoe'
7
2
 
8
- PKG_FILES = FileList[
9
- "lib/**/*", "test/**/*", "[A-Z]*", "Rakefile", "html/**/*"
10
- ]
11
-
12
- Gem::manage_gems
13
-
14
- task :default => [:test]
15
- task :package => [:test, :doc]
16
-
17
- spec = Gem::Specification.new do |s|
18
- s.name = "feed-normalizer"
19
- s.version = "1.1.0"
3
+ Hoe.new("feed-normalizer", "1.2.0") do |s|
20
4
  s.author = "Andrew A. Smith"
21
5
  s.email = "andy@tinnedfruit.org"
22
- s.homepage = "http://code.google.com/p/feed-normalizer/"
23
- s.platform = Gem::Platform::RUBY
6
+ s.url = "http://feed-normalizer.rubyforge.org/"
24
7
  s.summary = "Extensible Ruby wrapper for Atom and RSS parsers"
25
- s.files = PKG_FILES
26
- s.require_path = "lib"
27
- s.autorequire = "feed-normalizer"
28
- s.has_rdoc = true
29
- s.add_dependency "simple-rss", ">= 1.1"
8
+ s.description = s.paragraphs_of('Readme.txt', 1..2).join("\n\n")
9
+ s.changes = s.paragraphs_of('History.txt', 0..1).join("\n\n")
10
+ s.extra_deps << ["simple-rss", ">= 1.1"]
11
+ s.extra_deps << ["hpricot", ">= 0.4"]
12
+ s.need_zip = true
13
+ s.need_tar = false
30
14
  end
31
15
 
32
- Rake::GemPackageTask.new(spec) do |pkg|
33
- pkg.need_zip = true
34
- end
35
-
36
- Rake::TestTask.new do |t|
37
- t.libs << "test"
38
- t.test_files = FileList['test/*_test.rb']
39
- t.verbose = true
40
- end
41
16
 
42
- desc "Create documentation"
43
- Rake::RDocTask.new("doc") do |rdoc|
44
- rdoc.title = "Feed Normalizer"
45
- rdoc.rdoc_dir = 'doc'
46
- rdoc.rdoc_files.include('README')
47
- rdoc.rdoc_files.include('lib/**/*.rb')
17
+ begin
18
+ require 'rcov/rcovtask'
19
+ Rcov::RcovTask.new("rcov") do |t|
20
+ t.test_files = Dir['test/test_all.rb']
21
+ end
22
+ rescue LoadError
23
+ nil
48
24
  end
49
25
 
@@ -38,6 +38,17 @@ The feed representation stays the same, even though a different parser was used.
38
38
  feed.class # => FeedNormalizer::Feed
39
39
  feed.parser # => SimpleRSS
40
40
 
41
+ == Cleaning / Sanitizing
42
+
43
+ feed.title # => "My Feed > Your Feed"
44
+ feed.entries.first.content # => "<p x='y'>Hello</p><object></object></html>"
45
+ feed.clean!
46
+
47
+ All elements should now be either clean HTML, or HTML escaped strings.
48
+
49
+ feed.title # => "My Feed &gt; Your Feed"
50
+ feed.entries.first.content # => "<p>Hello</p>"
51
+
41
52
  == Extending
42
53
 
43
54
  Implement a parser wrapper by extending the FeedNormalizer::Parser class and overriding
@@ -49,4 +60,4 @@ See FeedNormalizer::RubyRssParser and FeedNormalizer::SimpleRssParser for exampl
49
60
  == Authors
50
61
  * Andrew A. Smith (andy@tinnedfruit.org)
51
62
 
52
- This library is released under the terms of the BSD License (see the LICENSE file for details).
63
+ This library is released under the terms of the BSD License (see the License.txt file for details).
@@ -1,4 +1,5 @@
1
1
  require 'structures'
2
+ require 'html-cleaner'
2
3
 
3
4
  module FeedNormalizer
4
5
 
@@ -88,21 +89,16 @@ module FeedNormalizer
88
89
  # used first, and if try_others is false, it is the only parser used,
89
90
  # otherwise all parsers in the ParserRegistry are attempted next, in
90
91
  # order of priority.
91
- def self.parse(xml, forced_parser=nil, try_others=false)
92
+ def self.parse(xml, opts = {})
92
93
 
93
94
  # Get a string ASAP, as multiple read()'s will start returning nil..
94
95
  xml = xml.respond_to?(:read) ? xml.read : xml.to_s
95
96
 
96
- if forced_parser
97
- result = forced_parser.parse(xml)
97
+ if opts[:force_parser]
98
+ result = opts[:force_parser].parse(xml)
98
99
 
99
- if result
100
- return result
101
- elsif !try_others
102
- return nil
103
- else
104
- # fall through and continue with other parsers
105
- end
100
+ return result if result
101
+ return nil if opts[:try_others] == false
106
102
  end
107
103
 
108
104
  ParserRegistry.parsers.each do |parser|
@@ -0,0 +1,186 @@
1
+ require 'hpricot'
2
+ require 'cgi'
3
+
4
+ module FeedNormalizer
5
+
6
+ # Various methods for cleaning up HTML and preparing it for safe public
7
+ # consumption.
8
+ #
9
+ # Documents used for reference:
10
+ # - http://www.w3.org/TR/html4/index/attributes.html
11
+ # - http://en.wikipedia.org/wiki/List_of_XML_and_HTML_character_entity_references
12
+ # - http://feedparser.org/docs/html-sanitization.html
13
+ # - http://code.whytheluckystiff.net/hpricot/wiki
14
+ class HtmlCleaner
15
+
16
+ # allowed html elements.
17
+ HTML_ELEMENTS = %w(
18
+ a abbr acronym address area b bdo big blockquote br button caption center
19
+ cite code col colgroup dd del dfn dir div dl dt em fieldset font h1 h2 h3
20
+ h4 h5 h6 hr i img ins kbd label legend li map menu ol optgroup p pre q s
21
+ samp small span strike strong sub sup table tbody td tfoot th thead tr tt
22
+ u ul var
23
+ )
24
+
25
+ # allowed attributes.
26
+ HTML_ATTRS = %w(
27
+ abbr accept accept-charset accesskey align alt axis border cellpadding
28
+ cellspacing char charoff charset checked cite class clear cols colspan
29
+ color compact coords datetime dir disabled for frame headers height href
30
+ hreflang hspace id ismap label lang longdesc maxlength media method
31
+ multiple name nohref noshade nowrap readonly rel rev rows rowspan rules
32
+ scope selected shape size span src start summary tabindex target title
33
+ type usemap valign value vspace width
34
+ )
35
+
36
+ # allowed attributes, but they can contain URIs, extra caution required.
37
+ # NOTE: That means this doesn't list *all* URI attrs, just the ones that are allowed.
38
+ HTML_URI_ATTRS = %w(
39
+ href src cite usemap longdesc
40
+ )
41
+
42
+ DODGY_URI_SCHEMES = %w(
43
+ javascript vbscript mocha livescript data
44
+ )
45
+
46
+ class << self
47
+
48
+ # Does this:
49
+ # - Unescape HTML
50
+ # - Parse HTML into tree
51
+ # - Find 'body' if present, and extract tree inside that tag, otherwise parse whole tree
52
+ # - Each tag:
53
+ # - remove tag if not whitelisted
54
+ # - escape HTML tag contents
55
+ # - remove all attributes not on whitelist
56
+ # - extra-scrub URI attrs; see dodgy_uri?
57
+ #
58
+ # Extra (i.e. unmatched) ending tags and comments are removed.
59
+ def clean(str)
60
+ str = unescapeHTML(str)
61
+
62
+ doc = Hpricot(str, :xhtml_strict => true)
63
+ doc = subtree(doc, :body)
64
+
65
+ # get all the tags in the document
66
+ tags = (doc/"*").collect {|e| e.name}
67
+
68
+ # Remove tags that aren't whitelisted.
69
+ remove_tags!(doc, tags - HTML_ELEMENTS)
70
+ remaining_tags = tags & HTML_ELEMENTS
71
+
72
+ # Remove attributes that aren't on the whitelist, or are suspicious URLs.
73
+ (doc/remaining_tags.join(",")).each do |element|
74
+ element.attributes.reject! do |attr,val|
75
+ !HTML_ATTRS.include?(attr) || (HTML_URI_ATTRS.include?(attr) && dodgy_uri?(val))
76
+ end
77
+
78
+ element.attributes = element.attributes.build_hash {|a,v| [a, add_entities(v)]}
79
+ end unless remaining_tags.empty?
80
+
81
+ doc.traverse_text {|t| t.set(add_entities(t.to_s))}
82
+
83
+ # Return the tree, without comments. Ugly way of removing comments,
84
+ # but can't see a way to do this in Hpricot yet.
85
+ doc.to_s.gsub(/<\!--.*-->/mi, '')
86
+ end
87
+
88
+ # For all other feed elements:
89
+ # - Unescape HTML.
90
+ # - Parse HTML into tree (taking 'body' as root, if present)
91
+ # - Takes text out of each tag, and escapes HTML.
92
+ # - Returns all text concatenated.
93
+ def flatten(str)
94
+ str.gsub!("\n", " ")
95
+ str = unescapeHTML(str)
96
+
97
+ doc = Hpricot(str, :xhtml_strict => true)
98
+ doc = subtree(doc, :body)
99
+
100
+ out = ""
101
+ doc.traverse_text {|t| out << add_entities(t.to_s)}
102
+
103
+ return out
104
+ end
105
+
106
+ # Returns true if the given string contains a suspicious URL,
107
+ # i.e. a javascript link.
108
+ #
109
+ # This method rejects javascript, vbscript, livescript, mocha and data URLs.
110
+ # It *could* be refined to only deny dangerous data URLs, however.
111
+ def dodgy_uri?(uri)
112
+
113
+ # special case for poorly-formed entities (missing ';')
114
+ # if these occur *anywhere* within the string, then throw it out.
115
+ return true if (uri =~ /&\#(\d+|x[0-9a-f]+)[^;\d]/mi)
116
+
117
+ # Try escaping as both HTML or URI encodings, and then trying
118
+ # each scheme regexp on each
119
+ [unescapeHTML(uri), CGI.unescape(uri)].each do |unesc_uri|
120
+ DODGY_URI_SCHEMES.each do |scheme|
121
+
122
+ regexp = "#{scheme}:".gsub(/./) do |char|
123
+ "([\000-\037\177\s]*)#{char}"
124
+ end
125
+
126
+ # regexp looks something like
127
+ # /\A([\000-\037\177\s]*)j([\000-\037\177\s]*)a([\000-\037\177\s]*)v([\000-\037\177\s]*)a([\000-\037\177\s]*)s([\000-\037\177\s]*)c([\000-\037\177\s]*)r([\000-\037\177\s]*)i([\000-\037\177\s]*)p([\000-\037\177\s]*)t([\000-\037\177\s]*):/mi
128
+ return true if (unesc_uri =~ %r{\A#{regexp}}mi)
129
+ end
130
+ end
131
+
132
+ nil
133
+ end
134
+
135
+ # unescapes HTML. If xml is true, also converts XML-only named entities to HTML.
136
+ def unescapeHTML(str, xml = true)
137
+ CGI.unescapeHTML(xml ? str.gsub("&apos;", "&#39;") : str)
138
+ end
139
+
140
+ # Adds entities where possible.
141
+ # Works like CGI.escapeHTML, but will not escape existing entities;
142
+ # i.e. &#123; will NOT become &amp;#123;
143
+ #
144
+ # This method could be improved by adding a whitelist of html entities.
145
+ def add_entities(str)
146
+ str.gsub(/\"/n, '&quot;').gsub(/>/n, '&gt;').gsub(/</n, '&lt;').gsub(/&(?!(\#\d+|\#x([0-9a-f]+)|\w{2,8});)/nmi, '&amp;')
147
+ end
148
+
149
+ private
150
+
151
+ # Everything below elment, or the just return the doc if element not present.
152
+ def subtree(doc, element)
153
+ doc.at("//#{element}/*") || doc
154
+ end
155
+
156
+ def remove_tags!(doc, tags)
157
+ (doc/tags.join(",")).remove unless tags.empty?
158
+ end
159
+
160
+ end
161
+ end
162
+ end
163
+
164
+
165
+ module Enumerable
166
+ def build_hash
167
+ result = {}
168
+ self.each do |elt|
169
+ key, value = yield elt
170
+ result[key] = value
171
+ end
172
+ result
173
+ end
174
+ end
175
+
176
+ # http://blade.nagaokaut.ac.jp/cgi-bin/scat.rb/ruby/ruby-talk/207625
177
+ # Subject: A simple Hpricot text setter
178
+ # From: Chris Gehlker <canyonrat mac.com>
179
+ # Date: Fri, 11 Aug 2006 03:19:13 +0900
180
+ class Hpricot::Text
181
+ def set(string)
182
+ @content = string
183
+ self.raw_string = string
184
+ end
185
+ end
186
+
data/lib/parsers/rss.rb CHANGED
@@ -52,6 +52,7 @@ module FeedNormalizer
52
52
  :date_published => :pubDate,
53
53
  :urls => :link,
54
54
  :description => :description,
55
+ :content => :description,
55
56
  :title => :title,
56
57
  :authors => :author
57
58
  }
@@ -62,7 +63,6 @@ module FeedNormalizer
62
63
 
63
64
  # custom item elements
64
65
  feed_entry.id = rss_item.guid.content if rss_item.respond_to?(:guid) && rss_item.guid
65
- feed_entry.content.body = rss_item.description
66
66
  feed_entry.copyright = rss.copyright if rss_item.respond_to? :copyright
67
67
 
68
68
  feed.entries << feed_entry
@@ -53,6 +53,7 @@ module FeedNormalizer
53
53
  :date_published => [:pubDate, :published],
54
54
  :urls => :link,
55
55
  :description => [:description, :summary],
56
+ :content => [:content, :description],
56
57
  :title => :title,
57
58
  :authors => [:author, :contributor]
58
59
  }
@@ -64,7 +65,6 @@ module FeedNormalizer
64
65
  # custom entry elements
65
66
  feed_entry.id = atomrss_entry.guid || atomrss_entry[:id] # entries are a Hash..
66
67
  feed_entry.copyright = atomrss_entry.copyright || (atomrss.respond_to?(:copyright) ? atomrss.copyright : nil)
67
- feed_entry.content.body = atomrss_entry.content || atomrss_entry.description
68
68
 
69
69
  feed.entries << feed_entry
70
70
  end
@@ -74,7 +74,7 @@ module FeedNormalizer
74
74
 
75
75
  def self.image(parser)
76
76
  if parser.respond_to?(:image) && parser.image
77
- if parser.image.match /<url>/ # RSS image contains an <url> spec
77
+ if parser.image =~ /<url>/ # RSS image contains an <url> spec
78
78
  parser.image.scan(/<url>(.*)<\/url>/).to_s
79
79
  else
80
80
  parser.image # Atom contains just the url
@@ -90,9 +90,7 @@ module FeedNormalizer
90
90
 
91
91
  # gets the value returned from the method if it overriden, otherwise nil.
92
92
  def self.overridden_value(object, method)
93
- # XXX: hack to find out if the id method is overriden
94
- # Highly dependent upon Method's to_s :(
95
- object.id if object.method(:id).to_s.match /SimpleRSS\#/
93
+ object.class.public_instance_methods(false).include? method
96
94
  end
97
95
 
98
96
  end
data/lib/structures.rb CHANGED
@@ -10,14 +10,12 @@ module FeedNormalizer
10
10
  # Example:
11
11
  # Object contains an array called 'alphas', which looks like [:a, :b, :c].
12
12
  # Call object.alpha and :a is returned.
13
- def method_missing(name)
14
- if name.to_s =~ /[^s]$/ # doesnt end with 's'
15
- plural = :"#{name}s"
16
- if self.respond_to?(plural)
17
- return self.send(plural).first
18
- end
19
- end
20
- nil
13
+ def method_missing(name, *args)
14
+ return self.send(:"#{name}s").first rescue super(name, *args)
15
+ end
16
+
17
+ def respond_to?(x, y=false)
18
+ self.class::ELEMENTS.include?(x) || self.class::ELEMENTS.include?(:"#{x}s") || super(x, y)
21
19
  end
22
20
 
23
21
  end
@@ -34,55 +32,126 @@ module FeedNormalizer
34
32
  self.class::ELEMENTS.collect{|el| self.instance_variable_get("@#{el}")==other.instance_variable_get("@#{el}")}.all?)
35
33
  end
36
34
 
37
- end
38
-
39
- # Wraps content used in an Entry. type defaults to :text.
40
- class Content
41
- TYPE = [:text, :html, :xhtml]
42
- attr_accessor :type, :body
35
+ # Returns the difference between two Feed instances as a hash.
36
+ # Any top-level differences in the Feed object as presented as:
37
+ #
38
+ # { :title => [content, other_content] }
39
+ #
40
+ # For differences at the items level, an array of hashes shows the diffs
41
+ # on a per-entry basis. Only entries that differ will contain a hash:
42
+ #
43
+ # { :items => [
44
+ # {:title => ["An article tile", "A new article title"]},
45
+ # {:title => ["one title", "a different title"]} ]}
46
+ #
47
+ # If the number of items in each feed are different, then the count of each
48
+ # is provided instead:
49
+ #
50
+ # { :items => [4,5] }
51
+ #
52
+ # This method can also be useful for human-readable feed comparison if
53
+ # its output is dumped to YAML.
54
+ def diff(other, elements = self.class::ELEMENTS)
55
+ diffs = {}
56
+
57
+ elements.each do |element|
58
+ if other.respond_to?(element)
59
+ self_value = self.send(element)
60
+ other_value = other.send(element)
61
+
62
+ next if self_value == other_value
63
+
64
+ diffs[element] = if other_value.respond_to?(:diff)
65
+ self_value.diff(other_value)
66
+
67
+ elsif other_value.is_a?(Enumerable) && other_value.all?{|v| v.respond_to?(:diff)}
68
+
69
+ if self_value.size != other_value.size
70
+ [self_value.size, other_value.size]
71
+ else
72
+ enum_diffs = []
73
+ self_value.each_with_index do |val, index|
74
+ enum_diffs << val.diff(other_value[index], val.class::ELEMENTS)
75
+ end
76
+ enum_diffs.reject{|h| h.empty?}
77
+ end
78
+
79
+ else
80
+ [other_value, self_value] unless other_value == self_value
81
+ end
82
+ end
83
+ end
43
84
 
44
- def initialize
45
- @type = :text
85
+ diffs
46
86
  end
47
87
 
48
- def to_s
49
- body
50
- end
88
+ end
51
89
 
52
- def eql?(other)
53
- self == (other)
54
- end
90
+ module ElementCleaner
91
+ # Recursively cleans all elements in place.
92
+ #
93
+ # Only allow tags in whitelist. Always parse the html with a parser and delete
94
+ # all tags that arent on the list.
95
+ #
96
+ # For feed elements that can contain HTML:
97
+ # - feed.(title|description)
98
+ # - feed.entries[n].(title|description|content)
99
+ #
100
+ def clean!
101
+ self.class::SIMPLE_ELEMENTS.each do |element|
102
+ val = self.send(element)
55
103
 
56
- # Equal if the body is the same. Ignores type.
57
- def ==(other)
58
- other.equal?(self) ||
59
- (other.instance_of?(self.class) &&
60
- self.body == other.body)
104
+ send("#{element}=", (val.is_a?(Array) ?
105
+ val.collect{|v| HtmlCleaner.flatten(v.to_s)} : HtmlCleaner.flatten(val.to_s)))
106
+ end
107
+
108
+ self.class::HTML_ELEMENTS.each do |element|
109
+ send("#{element}=", HtmlCleaner.clean(self.send(element).to_s))
110
+ end
111
+
112
+ self.class::BLENDED_ELEMENTS.each do |element|
113
+ self.send(element).collect{|v| v.clean!}
114
+ end
61
115
  end
62
116
  end
63
117
 
118
+
64
119
  # Represents a feed item entry.
65
120
  class Entry
66
- include Singular, ElementEquality
121
+ include Singular, ElementEquality, ElementCleaner
122
+
123
+ HTML_ELEMENTS = [:content, :description, :title]
124
+ SIMPLE_ELEMENTS = [:date_published, :urls, :id, :authors, :copyright]
125
+ BLENDED_ELEMENTS = []
67
126
 
68
- ELEMENTS = [:content, :date_published, :urls, :description, :title, :id, :authors, :copyright]
69
- attr_accessor *ELEMENTS
127
+ ELEMENTS = HTML_ELEMENTS + SIMPLE_ELEMENTS + BLENDED_ELEMENTS
128
+
129
+ attr_accessor(*ELEMENTS)
70
130
 
71
131
  def initialize
72
132
  @urls = []
73
133
  @authors = []
74
- @content = Content.new
75
134
  end
76
135
 
77
136
  end
78
137
 
79
138
  # Represents the root element of a feed.
80
139
  class Feed
81
- include Singular, ElementEquality
140
+ include Singular, ElementEquality, ElementCleaner
141
+
142
+ # Elements that can contain HTML fragments.
143
+ HTML_ELEMENTS = [:title, :description]
82
144
 
83
- ELEMENTS = [:title, :description, :id, :last_updated, :copyright, :authors, :urls, :image, :generator, :items]
84
- attr_accessor *ELEMENTS
85
- attr_accessor :parser
145
+ # Elements that contain 'plain' Strings, with HTML escaped.
146
+ SIMPLE_ELEMENTS = [:id, :last_updated, :copyright, :authors, :urls, :image, :generator]
147
+
148
+ # Elements that contain both HTML and escaped HTML.
149
+ BLENDED_ELEMENTS = [:items]
150
+
151
+ ELEMENTS = HTML_ELEMENTS + SIMPLE_ELEMENTS + BLENDED_ELEMENTS
152
+
153
+ attr_accessor(*ELEMENTS)
154
+ attr_accessor(:parser)
86
155
 
87
156
  alias :entries :items
88
157
 
@@ -95,6 +164,7 @@ module FeedNormalizer
95
164
  end
96
165
 
97
166
  def channel() self end
167
+
98
168
  end
99
169
 
100
170
  end
@@ -0,0 +1,49 @@
1
+ <?xml version="1.0" encoding="ISO-8859-1" ?>
2
+ <?xml-stylesheet title="XSL_formatting" type="text/xsl" href="/shared/bsp/xsl/rss/nolsol.xsl"?>
3
+ <rss version="2.0">
4
+ <channel>
5
+ <title>diff</title>
6
+ <link>http://news.bbc.co.uk/go/rss/-/1/hi/technology/default.stm</link>
7
+ <description>Visit BBC News for up-to-the-minute news, breaking news, video, audio and feature stories. BBC News provides trusted World and UK news as well as local and regional perspectives. Also entertainment, business, science, technology and health news.</description>
8
+ <language>en-gb</language>
9
+ <lastBuildDate>Sat, 09 Sep 2006 14:57:06 GMT</lastBuildDate>
10
+ <copyright>Copyright: (C) British Broadcasting Corporation, see http://news.bbc.co.uk/1/hi/help/rss/4498287.stm for terms and conditions of reuse</copyright>
11
+ <docs>http://www.bbc.co.uk/syndication/</docs>
12
+ <ttl>15</ttl>
13
+
14
+ <image>
15
+ <title>BBC News</title>
16
+ <url>http://news.bbc.co.uk/nol/shared/img/bbc_news_120x60.gif</url>
17
+ <link>http://news.bbc.co.uk/go/rss/-/1/hi/technology/default.stm</link>
18
+ </image>
19
+
20
+ <item>
21
+ <title>diff</title>
22
+ <description>BBC Click investigates free security software and finds out who will protect PCs when Microsoft launches Vista.</description>
23
+ <link>http://news.bbc.co.uk/go/rss/-/1/hi/programmes/click_online/5326654.stm</link>
24
+ <guid isPermaLink="false">http://news.bbc.co.uk/1/hi/programmes/click_online/5326654.stm</guid>
25
+ <pubDate>Sat, 09 Sep 2006 12:45:35 GMT</pubDate>
26
+ <category>Click</category>
27
+ </item>
28
+
29
+ <item>
30
+ <title>diff</title>
31
+ <description>diff</description>
32
+ <link>diff</link>
33
+ <guid isPermaLink="false">http://news.bbc.co.uk/1/hi/technology/5328446.stm</guid>
34
+ <pubDate>Fri, 08 Sep 2006 16:18:08 GMT</pubDate>
35
+ <category>diff</category>
36
+ </item>
37
+
38
+ <item>
39
+ <title>MP3 player court order overturned</title>
40
+ <description>SanDisk puts its MP3 players back on display at a German electronics show after overturning a court injunction.</description>
41
+ <link>http://news.bbc.co.uk/go/rss/-/1/hi/technology/5326660.stm</link>
42
+ <guid isPermaLink="false">http://news.bbc.co.uk/1/hi/technology/5326660.stm</guid>
43
+ <pubDate>Fri, 08 Sep 2006 10:14:41 GMT</pubDate>
44
+ <category>Technology</category>
45
+ </item>
46
+
47
+ </channel>
48
+ </rss>
49
+
@@ -0,0 +1,40 @@
1
+ <?xml version="1.0" encoding="ISO-8859-1" ?>
2
+ <?xml-stylesheet title="XSL_formatting" type="text/xsl" href="/shared/bsp/xsl/rss/nolsol.xsl"?>
3
+ <rss version="2.0">
4
+ <channel>
5
+ <title>diff</title>
6
+ <link>http://news.bbc.co.uk/go/rss/-/1/hi/technology/default.stm</link>
7
+ <description>Visit BBC News for up-to-the-minute news, breaking news, video, audio and feature stories. BBC News provides trusted World and UK news as well as local and regional perspectives. Also entertainment, business, science, technology and health news.</description>
8
+ <language>en-gb</language>
9
+ <lastBuildDate>Sat, 09 Sep 2006 14:57:06 GMT</lastBuildDate>
10
+ <copyright>Copyright: (C) British Broadcasting Corporation, see http://news.bbc.co.uk/1/hi/help/rss/4498287.stm for terms and conditions of reuse</copyright>
11
+ <docs>http://www.bbc.co.uk/syndication/</docs>
12
+ <ttl>15</ttl>
13
+
14
+ <image>
15
+ <title>BBC News</title>
16
+ <url>http://news.bbc.co.uk/nol/shared/img/bbc_news_120x60.gif</url>
17
+ <link>http://news.bbc.co.uk/go/rss/-/1/hi/technology/default.stm</link>
18
+ </image>
19
+
20
+ <item>
21
+ <title>diff</title>
22
+ <description>BBC Click investigates free security software and finds out who will protect PCs when Microsoft launches Vista.</description>
23
+ <link>http://news.bbc.co.uk/go/rss/-/1/hi/programmes/click_online/5326654.stm</link>
24
+ <guid isPermaLink="false">http://news.bbc.co.uk/1/hi/programmes/click_online/5326654.stm</guid>
25
+ <pubDate>Sat, 09 Sep 2006 12:45:35 GMT</pubDate>
26
+ <category>Click</category>
27
+ </item>
28
+
29
+ <item>
30
+ <title>diff</title>
31
+ <description>A Japanese scientist who invented a sustainable form of light is awarded the Millennium Technology Prize.</description>
32
+ <link>diff</link>
33
+ <guid isPermaLink="false">http://news.bbc.co.uk/1/hi/technology/5328446.stm</guid>
34
+ <pubDate>Fri, 08 Sep 2006 16:18:08 GMT</pubDate>
35
+ <category>diff</category>
36
+ </item>
37
+
38
+ </channel>
39
+ </rss>
40
+
data/test/test_all.rb ADDED
@@ -0,0 +1,6 @@
1
+ $:.unshift(File.dirname(__FILE__))
2
+ $:.unshift(File.dirname(__FILE__) + '/../lib')
3
+
4
+ require 'test_htmlcleaner'
5
+ require 'test_feednormalizer'
6
+
@@ -0,0 +1,137 @@
1
+ require 'test/unit'
2
+ require 'feed-normalizer'
3
+
4
+ include FeedNormalizer
5
+
6
+ class FeedNormalizerTest < Test::Unit::TestCase
7
+
8
+ XML_FILES = {}
9
+
10
+ data_dir = File.dirname(__FILE__) + '/data'
11
+
12
+ # Load up the xml files
13
+ Dir.open(data_dir).each do |fn|
14
+ next unless fn =~ /[.]xml$/
15
+ XML_FILES[fn.scan(/(.*)[.]/).to_s.to_sym] = File.read(data_dir + "/#{fn}")
16
+ end
17
+
18
+ def test_basic_parse
19
+ assert_kind_of Feed, FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rss20])
20
+ end
21
+
22
+ def test_force_parser
23
+ assert_kind_of Feed, FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rss20],
24
+ :force_parser => RubyRssParser, :try_others => true)
25
+ end
26
+
27
+ def test_force_parser_exclusive
28
+ assert_kind_of Feed, FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rss20],
29
+ :force_parser => RubyRssParser, :try_others => false)
30
+ end
31
+
32
+ def test_ruby_rss_parser
33
+ assert_kind_of Feed, FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rss20],
34
+ :force_parser => RubyRssParser, :try_others => false)
35
+ assert_kind_of Feed, FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rdf10],
36
+ :force_parser => RubyRssParser, :try_others => false)
37
+ end
38
+
39
+ def test_simple_rss_parser
40
+ assert_kind_of Feed, FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rss20],
41
+ :force_parser => SimpleRssParser, :try_others => false)
42
+ assert_kind_of Feed, FeedNormalizer::FeedNormalizer.parse(XML_FILES[:atom10],
43
+ :force_parser => SimpleRssParser, :try_others => false)
44
+ end
45
+
46
+ def test_parser_failover_order
47
+ assert_equal SimpleRSS, FeedNormalizer::FeedNormalizer.parse(XML_FILES[:atom10], :force_parser => RubyRssParser).parser
48
+ end
49
+
50
+ def test_force_parser_fail
51
+ assert_nil FeedNormalizer::FeedNormalizer.parse(XML_FILES[:atom10], :force_parser => RubyRssParser, :try_others => false)
52
+ end
53
+
54
+ def test_all_parsers_fail
55
+ assert_nil FeedNormalizer::FeedNormalizer.parse("This isn't RSS or Atom!")
56
+ end
57
+
58
+ def test_correct_parser_used
59
+ assert_equal RSS::Parser, FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rss20]).parser
60
+ assert_equal SimpleRSS, FeedNormalizer::FeedNormalizer.parse(XML_FILES[:atom10]).parser
61
+ end
62
+
63
+ def test_rss
64
+ feed = FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rss20])
65
+
66
+ assert_equal "BBC News | Technology | UK Edition", feed.title
67
+ assert_equal ["http://news.bbc.co.uk/go/rss/-/1/hi/technology/default.stm"], feed.urls
68
+ assert_equal "MP3 player court order overturned", feed.entries.last.title
69
+ assert_equal "SanDisk puts its MP3 players back on display at a German electronics show after overturning a court injunction.", feed.entries.last.description
70
+ assert_equal "SanDisk puts its MP3 players back on display at a German electronics show after overturning a court injunction.", feed.entries.last.content
71
+ end
72
+
73
+ def test_simplerss
74
+ feed = FeedNormalizer::FeedNormalizer.parse(XML_FILES[:atom10])
75
+
76
+ assert_equal "~:caboose", feed.title
77
+ assert_equal "http://habtm.com/xml/atom10/feed.xml", feed.url
78
+ assert_equal "Starfish - Easy Distribution of Site Maintenance", feed.entries.last.title
79
+ assert_equal "urn:uuid:6c028f36-f87a-4f53-b7e3-1f943d2341f0", feed.entries.last.id
80
+
81
+ assert !feed.entries.last.description.include?("google fame")
82
+ assert feed.entries.last.content.include?("google fame")
83
+ end
84
+
85
+ def test_sanity_check
86
+ XML_FILES.keys.each do |xml_file|
87
+ feed = FeedNormalizer::FeedNormalizer.parse(XML_FILES[xml_file])
88
+
89
+ assert [feed.title, feed.url, feed.entries.first.url].collect{|e| e.is_a?(String)}.all?, "Not everything was a String in #{xml_file}"
90
+ assert [feed.parser, feed.class].collect{|e| e.is_a?(Class)}.all?
91
+ end
92
+ end
93
+
94
+ def test_feed_equality
95
+ assert_equal FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rss20]), FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rss20])
96
+ assert_equal FeedNormalizer::FeedNormalizer.parse(XML_FILES[:atom10]), FeedNormalizer::FeedNormalizer.parse(XML_FILES[:atom10])
97
+ assert_not_equal FeedNormalizer::FeedNormalizer.parse(XML_FILES[:atom03]), FeedNormalizer::FeedNormalizer.parse(XML_FILES[:atom10])
98
+ assert_not_equal FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rss20]), FeedNormalizer::FeedNormalizer.parse(XML_FILES[:atom10])
99
+ assert_not_equal FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rss20]), FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rss20diff])
100
+ end
101
+
102
+ def test_feed_diff
103
+ feed = FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rss20])
104
+
105
+ diff = feed.diff(FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rss20diff]))
106
+ diff_short = feed.diff(FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rss20diff_short]))
107
+ no_diff = feed.diff(feed)
108
+
109
+ assert diff.keys.all? {|key| [:title, :items].include?(key)}
110
+ assert_equal 2, diff[:items].size
111
+
112
+ assert diff_short.keys.all? {|key| [:title, :items].include?(key)}
113
+ assert_equal [3,2], diff_short[:items]
114
+
115
+ assert no_diff.empty?
116
+ end
117
+
118
+ def test_marshal
119
+ feed = FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rss20])
120
+
121
+ assert_nothing_raised { Marshal.load(Marshal.dump(feed)) }
122
+ end
123
+
124
+ def test_method_missing
125
+ assert_raise(NoMethodError) { Feed.new(nil).nonexistant }
126
+ end
127
+
128
+ def test_clean
129
+ feed = FeedNormalizer::FeedNormalizer.parse(XML_FILES[:atom10])
130
+
131
+ assert feed.entries.first.content !~ /\<p\>/
132
+ feed.clean!
133
+ assert feed.entries.first.content =~ /\<p\>/
134
+ end
135
+
136
+ end
137
+
@@ -0,0 +1,151 @@
1
+ require 'test/unit'
2
+ require 'html-cleaner'
3
+
4
+ include FeedNormalizer
5
+
6
+ class HtmlCleanerTest < Test::Unit::TestCase
7
+
8
+ def test_unescape
9
+ assert_equal "' ' &deg;", FeedNormalizer::HtmlCleaner.unescapeHTML("&apos; &#39; &deg;")
10
+ assert_equal "\" &deg;", FeedNormalizer::HtmlCleaner.unescapeHTML("&quot; &deg;")
11
+ assert_equal "\"\"\"\"", FeedNormalizer::HtmlCleaner.unescapeHTML("&#34;&#000000000000000000034;&#x22;&#x0000022;")
12
+ assert_equal "heavily subnet&#8217;d network,", FeedNormalizer::HtmlCleaner.unescapeHTML("heavily subnet&#8217;d network,")
13
+ end
14
+
15
+ def test_add_entities
16
+ assert_equal "x &gt; y", HtmlCleaner.add_entities("x > y")
17
+ assert_equal "1 &amp; 2", HtmlCleaner.add_entities("1 & 2")
18
+ assert_equal "&amp; &#123; &acute; &#x123;", HtmlCleaner.add_entities("& &#123; &acute; &#x123;")
19
+ assert_equal "&amp; &#123; &ACUTE; &#X123A; &#x80f;", HtmlCleaner.add_entities("& &#123; &ACUTE; &#X123A; &#x80f;")
20
+ assert_equal "heavily subnet&#8217;d network,", HtmlCleaner.add_entities("heavily subnet&#8217;d network,")
21
+ end
22
+
23
+ def test_html_clean
24
+ assert_equal "", HtmlCleaner.clean("")
25
+
26
+ assert_equal "<p>foo &gt; *</p>", HtmlCleaner.clean("<p>foo > *</p>")
27
+ assert_equal "<p>foo &gt; *</p>", HtmlCleaner.clean("<p>foo &gt; *</p>")
28
+
29
+ assert_equal "<p>para</p>", HtmlCleaner.clean("<p foo=bar>para</p>")
30
+ assert_equal "<p>para</p> outsider", HtmlCleaner.clean("<p foo=bar>para</p> outsider")
31
+
32
+ assert_equal "<p>para</p>", HtmlCleaner.clean("<p>para</p></notvalid>")
33
+ assert_equal "<p>para</p>", HtmlCleaner.clean("<p>para</p></body>")
34
+
35
+ assert_equal "<p>para</p>", HtmlCleaner.clean("<p>para</p><plaintext>")
36
+ assert_equal "<p>para</p>", HtmlCleaner.clean("<p>para</p><object><param></param></object>")
37
+ assert_equal "<p>para</p>", HtmlCleaner.clean("<p>para</p><iframe src='http://evil.example.org'></iframe>")
38
+ assert_equal "<p>para</p>", HtmlCleaner.clean("<p>para</p><iframe src='http://evil.example.org'>")
39
+
40
+ assert_equal "<p>para</p>", HtmlCleaner.clean("<p>para</p><invalid>invalid</invalid>")
41
+
42
+ assert_equal "<a href=\"http://example.org\">para</a>", HtmlCleaner.clean("<a href='http://example.org'>para</a>")
43
+ assert_equal "<a href=\"http://example.org/proc?a&amp;b\">para</a>", HtmlCleaner.clean("<a href='http://example.org/proc?a&b'>para</a>")
44
+
45
+ assert_equal "<p>two</p>", HtmlCleaner.clean("<p>para</p><body><p>two</p></body>")
46
+ assert_equal "<p>two</p>", HtmlCleaner.clean("<p>para</p><body><p>two</p>")
47
+ assert_equal "<p>para</p>&lt;bo /dy&gt;<p>two</p>", HtmlCleaner.clean("<p>para</p><bo /dy><p>two</p></body>")
48
+ assert_equal "<p>para</p>&lt;bo\\/dy&gt;<p>two</p>", HtmlCleaner.clean("<p>para</p><bo\\/dy><p>two</p></body>")
49
+ assert_equal "<p>para</p><p>two</p>", HtmlCleaner.clean("<p>para</p><body/><p>two</p></body>")
50
+
51
+ assert_equal "<p>one &amp; two</p>", HtmlCleaner.clean(HtmlCleaner.clean("<p>one & two</p>"))
52
+
53
+ assert_equal "<p id=\"p\">para</p>", HtmlCleaner.clean("<p id=\"p\" ignore=\"this\">para</p>")
54
+ assert_equal "<p id=\"p\">para</p>", HtmlCleaner.clean("<p id=\"p\" onclick=\"this\">para</p>")
55
+
56
+ assert_equal "<img src=\"http://example.org/pic\" />", HtmlCleaner.clean("<img src=\"http://example.org/pic\" />")
57
+ assert_equal "<img />", HtmlCleaner.clean("<img src=\"jav a script:call()\" />")
58
+
59
+ assert_equal "what's new", HtmlCleaner.clean("what&#000039;s new")
60
+ assert_equal "&quot;what's new?&quot;", HtmlCleaner.clean("\"what&apos;s new?\"")
61
+ assert_equal "&quot;what's new?&quot;", HtmlCleaner.clean("&quot;what&apos;s new?&quot;")
62
+
63
+ # Real-world examples from selected feeds
64
+ assert_equal "I have a heavily subnet&#8217;d/vlan&#8217;d network,", HtmlCleaner.clean("I have a heavily subnet&#8217;d/vlan&#8217;d network,")
65
+
66
+ assert_equal "<pre><blockquote>&lt;%= start_form_tag :action =&gt; &quot;create&quot; %&gt;</blockquote></pre>",
67
+ HtmlCleaner.clean("<pre><blockquote>&lt;%= start_form_tag :action => \"create\" %></blockquote></pre>")
68
+
69
+ assert_equal "<a href=\"http://www.mcall.com/news/local/all-smashedmachine1107-cn,0,1574203.story?coll=all-news-hed\">[link]</a><a href=\"http://reddit.com/info/pyhc/comments\">[more]</a>",
70
+ HtmlCleaner.clean("&lt;a href=\"http://www.mcall.com/news/local/all-smashedmachine1107-cn,0,1574203.story?coll=all-news-hed\"&gt;[link]&lt;/a&gt;&lt;a href=\"http://reddit.com/info/pyhc/comments\"&gt;[more]&lt;/a&gt;")
71
+
72
+
73
+ # Various exploits from the past
74
+ assert_equal "", HtmlCleaner.clean("<_img foo=\"<IFRAME width='80%' height='400' src='http://alive.znep.com/~marcs/passport/grabit.html'></IFRAME>\" >")
75
+ assert_equal "<a href=\"https://bugzilla.mozilla.org/attachment.cgi?id=&amp;action=force_internal_error&lt;script&gt;alert(document.cookie)&lt;/script&gt;\">link</a>",
76
+ HtmlCleaner.clean("<a href=\"https://bugzilla.mozilla.org/attachment.cgi?id=&action=force_internal_error<script>alert(document.cookie)</script>\">link</a>")
77
+ assert_equal "<img src=\"doesntexist.jpg\" />", HtmlCleaner.clean("<img src='doesntexist.jpg' onerror='alert(document.cookie)'/>")
78
+ assert_equal "<img src=\"'doesntexist.jpg\" />", HtmlCleaner.clean("<img src=\"'doesntexist.jpg\" onmouseover=\"alert('img-ob-11');''\"/>")
79
+ assert_equal "&lt;IMG &quot;&quot;&quot;&gt;&quot;&gt;", HtmlCleaner.clean("<IMG \"\"\"><SCRIPT>alert(\"XSS\")</SCRIPT>\">")
80
+
81
+ # This doesnt come out as I would like, but the result is still safe.
82
+ # (Apparently, this would work in Gecko.)
83
+ assert HtmlCleaner.clean("<p onclick!\#$%&()*~+-_.,:;?@[/|\\]^=alert(\"XSS\")>para</p>") !~ /\<\>/
84
+ assert_equal "&lt;SCRIPT/XSS SRC=&quot;http://ha.ckers.org/xss.js&quot;&gt;", HtmlCleaner.clean("<SCRIPT/XSS SRC=\"http://ha.ckers.org/xss.js\"></SCRIPT>")
85
+
86
+ assert_equal "", HtmlCleaner.clean("<!--[if gte IE 4]><SCRIPT>alert('XSS');</SCRIPT><![endif]-->")
87
+ assert_equal "<p></p>", HtmlCleaner.clean("<p><!--[if gte IE 4]><SCRIPT>alert('XSS');</SCRIPT><![endif]--></p>")
88
+ assert_equal "<p>hi</p><p></p>", HtmlCleaner.clean("<p>hi</p><p><!--[if gte IE 4]><SCRIPT>alert('XSS');</SCRIPT><![endif]--></p>")
89
+ end
90
+
91
+ def test_html_flatten
92
+ assert_equal "", HtmlCleaner.flatten("")
93
+
94
+ assert_equal "hello", HtmlCleaner.flatten("hello")
95
+ assert_equal "hello world", HtmlCleaner.flatten("hello\nworld")
96
+
97
+ assert_equal "A &gt; B : C", HtmlCleaner.flatten("A > B : C")
98
+ assert_equal "what's new", HtmlCleaner.flatten("what&#39;s new")
99
+ assert_equal "&quot;what's new?&quot;", HtmlCleaner.flatten("\"what&apos;s new?\"")
100
+
101
+ assert_equal "we&#8217;ve got &lt;a hre", HtmlCleaner.flatten("we&#8217;ve got <a hre")
102
+
103
+ assert_equal "http://example.org", HtmlCleaner.flatten("http://example.org")
104
+ assert_equal "http://example.org/proc?a&amp;b", HtmlCleaner.flatten("http://example.org/proc?a&b")
105
+
106
+ assert_equal "&quot;what's new?&quot;", HtmlCleaner.flatten(HtmlCleaner.flatten("\"what&apos;s new?\""))
107
+ end
108
+
109
+ def test_dodgy_uri
110
+ # All of these javascript urls work in IE6.
111
+ assert HtmlCleaner.dodgy_uri?("javascript:alert('HI');")
112
+ assert HtmlCleaner.dodgy_uri?(" &#106;&#97;&#118;&#97;&#115;&#99;&#114;&#105;&#112;&#116; \n :alert('HI');")
113
+ assert HtmlCleaner.dodgy_uri?("JaVaScRiPt:alert('HI');")
114
+ assert HtmlCleaner.dodgy_uri?("JaV \naSc\nRiPt:alert('HI');")
115
+
116
+ # entities lacking ending ';'
117
+ # This only works if they're all packed together without spacing.
118
+ assert HtmlCleaner.dodgy_uri?("&#106&#97&#118&#97&#115&#99&#114&#105&#112&#116&#58&#97&#108&#101&#114&#116&#40&#39&#105&#109&#103&#45&#111&#98&#45&#50&#39&#41")
119
+ assert HtmlCleaner.dodgy_uri?("&#106&#97&#118&#97&#115&#99&#114&#105&#112&#116&#58&#97&#108&#101&#114&#116&#40&#39&#105&#109&#103&#45&#111&#98&#45&#50&#39 &#41 ; ")
120
+ # catch extra spacing anyway.. support for this is possible, depending where the spaces are.
121
+ assert HtmlCleaner.dodgy_uri?("&#106 &#97 &#118 &#97 &#115 &#99 &#114 &#105 &#112 &#116 &#58 &#97 &#108 &#101 &#114 &#116 &#40 &#39 &#105 &#109 &#103 &#45 &#111 &#98 &#45 &#50 &#39 &#41 ; ")
122
+ assert HtmlCleaner.dodgy_uri?("&#x06a &#97 &#118 &#97 &#115 &#99 &#114 &#105 &#112 &#116 &#58 &#97 &#108 &#101 &#114 &#116 &#40 &#39 &#105 &#109 &#103 &#45 &#111 &#98 &#45 &#50 &#39 &#41 ; ")
123
+ assert HtmlCleaner.dodgy_uri?("&#106avascript")
124
+ assert HtmlCleaner.dodgy_uri?("&#x06a;avascript")
125
+
126
+ # url-encoded
127
+ assert HtmlCleaner.dodgy_uri?("%6A%61%76%61%73%63%72%69%70%74%3A%61%6C%65%72%74%28%27%69%6D%67%2D%6F%62%2D%33%27%29")
128
+
129
+ # Other evil schemes
130
+ assert HtmlCleaner.dodgy_uri?("vbscript:MsgBox(\"hi\")")
131
+ assert HtmlCleaner.dodgy_uri?("mocha:alert('hi')")
132
+ assert HtmlCleaner.dodgy_uri?("livescript:alert('hi')")
133
+ assert HtmlCleaner.dodgy_uri?("data:text/html;base64,PHNjcmlwdD5hbGVydCgnWFNTJyk8L3NjcmlwdD4K")
134
+
135
+ # Various non-printing chars
136
+ assert HtmlCleaner.dodgy_uri?("javas\0cript:foo()")
137
+ assert HtmlCleaner.dodgy_uri?(" &#14; javascript:foo()")
138
+ assert HtmlCleaner.dodgy_uri?("jav&#x0A;ascript:foo()")
139
+ assert HtmlCleaner.dodgy_uri?("jav&#x09;ascript:foo()")
140
+ assert HtmlCleaner.dodgy_uri?("jav\tascript:foo()")
141
+
142
+ # The Good
143
+ assert_nil HtmlCleaner.dodgy_uri?("http://example.org")
144
+ assert_nil HtmlCleaner.dodgy_uri?("http://example.org/foo.html")
145
+ assert_nil HtmlCleaner.dodgy_uri?("http://example.org/foo.cgi?x=y&a=b")
146
+ assert_nil HtmlCleaner.dodgy_uri?("http://example.org/foo.cgi?x=y&amp;a=b")
147
+ assert_nil HtmlCleaner.dodgy_uri?("http://example.org/foo.cgi?x=y&#38;a=b")
148
+ assert_nil HtmlCleaner.dodgy_uri?("http://example.org/foo.cgi?x=y&#x56;a=b")
149
+ end
150
+
151
+ end
metadata CHANGED
@@ -3,16 +3,16 @@ rubygems_version: 0.8.11
3
3
  specification_version: 1
4
4
  name: feed-normalizer
5
5
  version: !ruby/object:Gem::Version
6
- version: 1.1.0
7
- date: 2006-10-05 00:00:00 -07:00
6
+ version: 1.2.0
7
+ date: 2006-11-29 00:00:00 -08:00
8
8
  summary: Extensible Ruby wrapper for Atom and RSS parsers
9
9
  require_paths:
10
10
  - lib
11
11
  email: andy@tinnedfruit.org
12
- homepage: http://code.google.com/p/feed-normalizer/
13
- rubyforge_project:
14
- description:
15
- autorequire: feed-normalizer
12
+ homepage: http://feed-normalizer.rubyforge.org/
13
+ rubyforge_project: feed-normalizer
14
+ description: An extensible Ruby wrapper for Atom and RSS parsers. Feed normalizer wraps various RSS and Atom parsers, and returns a single unified object graph, regardless of the underlying feed format.
15
+ autorequire:
16
16
  default_executable:
17
17
  bindir: bin
18
18
  has_rdoc: true
@@ -28,23 +28,27 @@ cert_chain:
28
28
  authors:
29
29
  - Andrew A. Smith
30
30
  files:
31
+ - History.txt
32
+ - License.txt
33
+ - Manifest.txt
34
+ - Rakefile
35
+ - Readme.txt
31
36
  - lib/feed-normalizer.rb
32
- - lib/parsers
33
- - lib/structures.rb
37
+ - lib/html-cleaner.rb
34
38
  - lib/parsers/rss.rb
35
39
  - lib/parsers/simple-rss.rb
36
- - test/base_test.rb
37
- - test/data
40
+ - lib/structures.rb
38
41
  - test/data/atom03.xml
39
42
  - test/data/atom10.xml
40
43
  - test/data/rdf10.xml
41
44
  - test/data/rss20.xml
42
- - LICENSE
43
- - Rakefile
44
- - README
45
- - RELEASE
46
- test_files: []
47
-
45
+ - test/data/rss20diff.xml
46
+ - test/data/rss20diff_short.xml
47
+ - test/test_all.rb
48
+ - test/test_feednormalizer.rb
49
+ - test/test_htmlcleaner.rb
50
+ test_files:
51
+ - test/test_all.rb
48
52
  rdoc_options: []
49
53
 
50
54
  extra_rdoc_files: []
@@ -56,6 +60,15 @@ extensions: []
56
60
  requirements: []
57
61
 
58
62
  dependencies:
63
+ - !ruby/object:Gem::Dependency
64
+ name: hoe
65
+ version_requirement:
66
+ version_requirements: !ruby/object:Gem::Version::Requirement
67
+ requirements:
68
+ - - ">="
69
+ - !ruby/object:Gem::Version
70
+ version: 1.1.6
71
+ version:
59
72
  - !ruby/object:Gem::Dependency
60
73
  name: simple-rss
61
74
  version_requirement:
@@ -65,3 +78,12 @@ dependencies:
65
78
  - !ruby/object:Gem::Version
66
79
  version: "1.1"
67
80
  version:
81
+ - !ruby/object:Gem::Dependency
82
+ name: hpricot
83
+ version_requirement:
84
+ version_requirements: !ruby/object:Gem::Version::Requirement
85
+ requirements:
86
+ - - ">="
87
+ - !ruby/object:Gem::Version
88
+ version: "0.4"
89
+ version:
data/RELEASE DELETED
@@ -1,13 +0,0 @@
1
- Steps to create a release:
2
-
3
- o Update Rakefile version
4
- o Create package
5
- rake clobber
6
- rake package
7
- o Tag release
8
- svn copy https://feed-normalizer.googlecode.com/svn/trunk \
9
- https://feed-normalizer.googlecode.com/svn/tags/RELEASE_[MAJOR]_[MINOR]_[REVISION]
10
- o Upload gem and zip to rubyforge
11
- o Update RAA
12
- o Post rubyforge news
13
-
data/test/base_test.rb DELETED
@@ -1,82 +0,0 @@
1
- $:.unshift(File.dirname(__FILE__) + '/../lib')
2
-
3
- require 'test/unit'
4
- require 'feed-normalizer'
5
-
6
- include FeedNormalizer
7
-
8
- class BaseTest < Test::Unit::TestCase
9
-
10
- XML_FILES = {}
11
-
12
- def setup
13
- data_dir = File.dirname(__FILE__) + '/data'
14
-
15
- # Load up the xml files
16
- Dir.open(data_dir).each do |fn|
17
- next unless fn =~ /[.]xml$/
18
- XML_FILES[fn.scan(/(.*)[.]/).to_s.to_sym] = File.read(data_dir + "/#{fn}")
19
- end
20
- end
21
-
22
-
23
- def test_basic_parse
24
- assert_kind_of Feed, FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rss20])
25
- end
26
-
27
- def test_force_parser
28
- assert_kind_of Feed, FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rss20], RubyRssParser, true)
29
- end
30
-
31
- def test_force_parser_exclusive
32
- assert_kind_of Feed, FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rss20], RubyRssParser, false)
33
- end
34
-
35
- def test_ruby_rss_parser
36
- assert_kind_of Feed, feed=FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rss20], RubyRssParser, false)
37
- assert_kind_of Feed, feed=FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rdf10], RubyRssParser, false)
38
- end
39
-
40
- def test_simple_rss_parser
41
- assert_kind_of Feed, feed=FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rss20], SimpleRssParser, false)
42
- assert_kind_of Feed, feed=FeedNormalizer::FeedNormalizer.parse(XML_FILES[:atom10], SimpleRssParser, false)
43
- end
44
-
45
- # Attempts to parse a feed that Ruby's RSS can't handle.
46
- # SimpleRSS should provide the parsed feed.
47
- def test_parser_failover_order
48
- assert_kind_of Feed, FeedNormalizer::FeedNormalizer.parse(XML_FILES[:atom10])
49
- end
50
-
51
- def test_all_parsers_fail
52
- assert_nil FeedNormalizer::FeedNormalizer.parse("This isn't RSS or Atom!")
53
- end
54
-
55
- def test_correct_parser_used
56
- assert_equal RSS::Parser, FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rss20]).parser
57
- assert_equal SimpleRSS, FeedNormalizer::FeedNormalizer.parse(XML_FILES[:atom10]).parser
58
- end
59
-
60
- def test_sanity_check
61
- XML_FILES.keys.each do |xml_file|
62
- feed = FeedNormalizer::FeedNormalizer.parse(XML_FILES[xml_file])
63
-
64
- assert [feed.title, feed.url, feed.entries.first.url].collect{|e| e.is_a?(String)}.all?, "Not everything was a String in #{xml_file}"
65
- assert [feed.parser, feed.class].collect{|e| e.is_a?(Class)}.all?
66
- end
67
- end
68
-
69
- def test_feed_equality
70
- assert_equal FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rss20]), FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rss20])
71
- assert_equal FeedNormalizer::FeedNormalizer.parse(XML_FILES[:atom10]), FeedNormalizer::FeedNormalizer.parse(XML_FILES[:atom10])
72
- assert_not_equal FeedNormalizer::FeedNormalizer.parse(XML_FILES[:atom03]), FeedNormalizer::FeedNormalizer.parse(XML_FILES[:atom10])
73
- assert_not_equal FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rss20]), FeedNormalizer::FeedNormalizer.parse(XML_FILES[:atom10])
74
-
75
- XML_FILES.keys.each do |xml_file|
76
- feed = FeedNormalizer::FeedNormalizer.parse(XML_FILES[xml_file])
77
- assert_equal feed, Marshal.load(Marshal.dump(feed))
78
- end
79
-
80
- end
81
-
82
- end