feed-normalizer 1.2.0 → 1.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +3 -0
- data/Rakefile +1 -1
- data/lib/html-cleaner.rb +6 -3
- data/lib/structures.rb +1 -1
- data/test/test_htmlcleaner.rb +2 -0
- metadata +10 -9
data/History.txt
CHANGED
data/Rakefile
CHANGED
data/lib/html-cleaner.rb
CHANGED
@@ -59,11 +59,13 @@ module FeedNormalizer
|
|
59
59
|
def clean(str)
|
60
60
|
str = unescapeHTML(str)
|
61
61
|
|
62
|
-
doc = Hpricot(str, :
|
62
|
+
doc = Hpricot(str, :fixup_tags => true)
|
63
63
|
doc = subtree(doc, :body)
|
64
64
|
|
65
65
|
# get all the tags in the document
|
66
|
-
|
66
|
+
# Somewhere near hpricot 0.4.92 "*" starting to return all elements,
|
67
|
+
# including text nodes instead of just tagged elements.
|
68
|
+
tags = (doc/"*").inject([]) { |m,e| m << e.name if(e.respond_to?(:name) && e.name =~ /^\w+$/) ; m }.uniq
|
67
69
|
|
68
70
|
# Remove tags that aren't whitelisted.
|
69
71
|
remove_tags!(doc, tags - HTML_ELEMENTS)
|
@@ -109,6 +111,7 @@ module FeedNormalizer
|
|
109
111
|
# This method rejects javascript, vbscript, livescript, mocha and data URLs.
|
110
112
|
# It *could* be refined to only deny dangerous data URLs, however.
|
111
113
|
def dodgy_uri?(uri)
|
114
|
+
uri = uri.to_s
|
112
115
|
|
113
116
|
# special case for poorly-formed entities (missing ';')
|
114
117
|
# if these occur *anywhere* within the string, then throw it out.
|
@@ -143,7 +146,7 @@ module FeedNormalizer
|
|
143
146
|
#
|
144
147
|
# This method could be improved by adding a whitelist of html entities.
|
145
148
|
def add_entities(str)
|
146
|
-
str.gsub(/\"/n, '&quot;').gsub(/>/n, '&gt;').gsub(/</n, '&lt;').gsub(/&(?!(\#\d+|\#x([0-9a-f]+)|\w{2,8});)/nmi, '&amp;')
|
149
|
+
str.to_s.gsub(/\"/n, '&quot;').gsub(/>/n, '&gt;').gsub(/</n, '&lt;').gsub(/&(?!(\#\d+|\#x([0-9a-f]+)|\w{2,8});)/nmi, '&amp;')
|
147
150
|
end
|
148
151
|
|
149
152
|
private
|
data/lib/structures.rb
CHANGED
@@ -29,7 +29,7 @@ module FeedNormalizer
|
|
29
29
|
def ==(other)
|
30
30
|
other.equal?(self) ||
|
31
31
|
(other.instance_of?(self.class) &&
|
32
|
-
self.class::ELEMENTS.
|
32
|
+
self.class::ELEMENTS.all?{ |el| self.send(el) == other.send(el)} )
|
33
33
|
end
|
34
34
|
|
35
35
|
# Returns the difference between two Feed instances as a hash.
|
data/test/test_htmlcleaner.rb
CHANGED
@@ -13,6 +13,7 @@ class HtmlCleanerTest < Test::Unit::TestCase
|
|
13
13
|
end
|
14
14
|
|
15
15
|
def test_add_entities
|
16
|
+
assert_equal "", HtmlCleaner.add_entities(nil)
|
16
17
|
assert_equal "x &gt; y", HtmlCleaner.add_entities("x > y")
|
17
18
|
assert_equal "1 &amp; 2", HtmlCleaner.add_entities("1 & 2")
|
18
19
|
assert_equal "&amp; &#123; &acute; &#x123;", HtmlCleaner.add_entities("& &#123; &acute; &#x123;")
|
@@ -140,6 +141,7 @@ class HtmlCleanerTest < Test::Unit::TestCase
|
|
140
141
|
assert HtmlCleaner.dodgy_uri?("jav\tascript:foo()")
|
141
142
|
|
142
143
|
# The Good
|
144
|
+
assert_nil HtmlCleaner.dodgy_uri?(nil)
|
143
145
|
assert_nil HtmlCleaner.dodgy_uri?("http://example.org")
|
144
146
|
assert_nil HtmlCleaner.dodgy_uri?("http://example.org/foo.html")
|
145
147
|
assert_nil HtmlCleaner.dodgy_uri?("http://example.org/foo.cgi?x=y&a=b")
|
metadata
CHANGED
@@ -1,10 +1,10 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
|
-
rubygems_version: 0.
|
2
|
+
rubygems_version: 0.9.2
|
3
3
|
specification_version: 1
|
4
4
|
name: feed-normalizer
|
5
5
|
version: !ruby/object:Gem::Version
|
6
|
-
version: 1.
|
7
|
-
date:
|
6
|
+
version: 1.3.0
|
7
|
+
date: 2007-05-22 00:00:00 -07:00
|
8
8
|
summary: Extensible Ruby wrapper for Atom and RSS parsers
|
9
9
|
require_paths:
|
10
10
|
- lib
|
@@ -25,6 +25,7 @@ required_ruby_version: !ruby/object:Gem::Version::Requirement
|
|
25
25
|
platform: ruby
|
26
26
|
signing_key:
|
27
27
|
cert_chain:
|
28
|
+
post_install_message:
|
28
29
|
authors:
|
29
30
|
- Andrew A. Smith
|
30
31
|
files:
|
@@ -61,29 +62,29 @@ requirements: []
|
|
61
62
|
|
62
63
|
dependencies:
|
63
64
|
- !ruby/object:Gem::Dependency
|
64
|
-
name:
|
65
|
+
name: simple-rss
|
65
66
|
version_requirement:
|
66
67
|
version_requirements: !ruby/object:Gem::Version::Requirement
|
67
68
|
requirements:
|
68
69
|
- - ">="
|
69
70
|
- !ruby/object:Gem::Version
|
70
|
-
version: 1.1
|
71
|
+
version: "1.1"
|
71
72
|
version:
|
72
73
|
- !ruby/object:Gem::Dependency
|
73
|
-
name:
|
74
|
+
name: hpricot
|
74
75
|
version_requirement:
|
75
76
|
version_requirements: !ruby/object:Gem::Version::Requirement
|
76
77
|
requirements:
|
77
78
|
- - ">="
|
78
79
|
- !ruby/object:Gem::Version
|
79
|
-
version: "
|
80
|
+
version: "0.4"
|
80
81
|
version:
|
81
82
|
- !ruby/object:Gem::Dependency
|
82
|
-
name:
|
83
|
+
name: hoe
|
83
84
|
version_requirement:
|
84
85
|
version_requirements: !ruby/object:Gem::Version::Requirement
|
85
86
|
requirements:
|
86
87
|
- - ">="
|
87
88
|
- !ruby/object:Gem::Version
|
88
|
-
version:
|
89
|
+
version: 1.2.0
|
89
90
|
version:
|