hpricot_scrub 0.2.3 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG.txt +9 -0
- data/Manifest.txt +2 -1
- data/lib/hpricot_scrub/hpricot_scrub.rb +203 -45
- data/lib/hpricot_scrub/version.rb +2 -2
- data/test/hpricot_scrub_test.rb +101 -57
- data/test/old_hpricot_scrub_test.rb +88 -0
- metadata +13 -8
- /data/examples/{config.yml → old_config.yml} +0 -0
data/CHANGELOG.txt
CHANGED
@@ -1,3 +1,12 @@
|
|
1
|
+
2008-01-11 Mina Naguib <mina.hpricotscrub@naguib.ca>
|
2
|
+
Release 0.3.0
|
3
|
+
Large overhaul of the module's logic to mimic most of perl's HTML::Scrubber
|
4
|
+
functionality:
|
5
|
+
- Deprecate config keys :allow_tags, :remove_tags and :allow_attributes
|
6
|
+
- Introduce config keys :elem_rules, :default_elem_rule,
|
7
|
+
:default_comment_rule and :default_attribute_rule
|
8
|
+
- Document the above (inline - visible in rdoc & the likes)
|
9
|
+
|
1
10
|
2007-04-05 Michael <michael@underpantsgnome.com>
|
2
11
|
Release 0.2.3
|
3
12
|
Add patches from Eric Wong
|
data/Manifest.txt
CHANGED
@@ -9,14 +9,120 @@ end
|
|
9
9
|
require 'hpricot'
|
10
10
|
|
11
11
|
module Hpricot
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
12
|
+
|
13
|
+
class Scrub
|
14
|
+
|
15
|
+
def self.normalize_config(config) #:nodoc:#
|
16
|
+
config = {} unless config.is_a?(Hash)
|
17
|
+
|
18
|
+
return config if config[:normalized]
|
19
|
+
|
20
|
+
config = {
|
21
|
+
|
22
|
+
# Legacy config keys:
|
23
|
+
:remove_tags => [],
|
24
|
+
:allow_tags => [],
|
25
|
+
:allow_attributes => [],
|
26
|
+
|
27
|
+
# New fine-grained hotness:
|
28
|
+
:elem_rules => {
|
29
|
+
"script" => false,
|
30
|
+
"style" => false
|
31
|
+
},
|
32
|
+
:default_elem_rule => :strip,
|
33
|
+
:default_comment_rule => false,
|
34
|
+
:default_attribute_rule => false
|
35
|
+
|
36
|
+
}.merge(config)
|
37
|
+
|
38
|
+
#
|
39
|
+
# Merge+delete legacy config keys
|
40
|
+
#
|
41
|
+
# :remove_tags
|
42
|
+
(config.delete(:remove_tags) || []).each do |tag|
|
43
|
+
config[:elem_rules][tag] = false unless config[:elem_rules].has_key?(tag)
|
44
|
+
end
|
45
|
+
# :allow_tags
|
46
|
+
(config.delete(:allow_tags) || []).each do |tag|
|
47
|
+
config[:elem_rules][tag] = true unless config[:elem_rules].has_key?(tag)
|
48
|
+
end
|
49
|
+
# :allow_attributes
|
50
|
+
(config.delete(:allow_attributes) || []).each do |attribute|
|
51
|
+
#
|
52
|
+
# Add it to the default attribute rule
|
53
|
+
#
|
54
|
+
old_rule = config[:default_attribute_rule]
|
55
|
+
config[:default_attribute_rule] = Proc.new do |parent_element, key, value|
|
56
|
+
if key == attribute
|
57
|
+
true
|
58
|
+
else
|
59
|
+
Scrub::keep_attribute?(parent_element, key, value, old_rule)
|
60
|
+
end
|
61
|
+
end
|
62
|
+
end
|
63
|
+
|
64
|
+
config[:normalized] = true
|
65
|
+
return config
|
66
|
+
end
|
67
|
+
|
68
|
+
#
|
69
|
+
# Takes:
|
70
|
+
#
|
71
|
+
# An element
|
72
|
+
# An attribute key found in that element
|
73
|
+
# The attribute value attached to the key
|
74
|
+
# An attribute rule
|
75
|
+
#
|
76
|
+
# Checks the rule against the attribute and returns:
|
77
|
+
#
|
78
|
+
# true = the attribute should be kept
|
79
|
+
# false = the attribute should NOT be kept
|
80
|
+
#
|
81
|
+
# Acceptable attribute rules are:
|
82
|
+
#
|
83
|
+
# true: Keep the attribute without inspection
|
84
|
+
# a String: Attribute value must be the same as the string
|
85
|
+
# an Array: Attribute key must exist in the array
|
86
|
+
# a Regexp: Attribute value must match the regexp
|
87
|
+
# a Hash: The attribute key is found in the hash, and the value is considered a new rule and follows these same rules via recursion
|
88
|
+
# a Proc: The Proc is called with arguments (parent_element, key, value), the returned value is considered a new rule and follows these same rules via recursion
|
89
|
+
# otherwise: Remove the attribute
|
90
|
+
#
|
91
|
+
def self.keep_attribute?(parent_element, key, value, attribute_rule)
|
92
|
+
|
93
|
+
if attribute_rule == true
|
94
|
+
keep = true
|
95
|
+
elsif attribute_rule.is_a?(String)
|
96
|
+
keep = (attribute_rule == value)
|
97
|
+
elsif attribute_rule.is_a?(Array)
|
98
|
+
keep = attribute_rule.include?(key)
|
99
|
+
elsif attribute_rule.is_a?(Regexp)
|
100
|
+
keep = attribute_rule.match(value)
|
101
|
+
elsif attribute_rule.is_a?(Hash)
|
102
|
+
# Allow hash value to be new rule via recursion
|
103
|
+
new_rule = attribute_rule[key]
|
104
|
+
keep = keep_attribute?(parent_element, key, value, new_rule)
|
105
|
+
elsif attribute_rule.is_a?(Proc)
|
106
|
+
# Allow the proc to return a new rule - recurse:
|
107
|
+
new_rule = attribute_rule.call(parent_element, key, value)
|
108
|
+
keep = keep_attribute?(parent_element, key, value, new_rule)
|
109
|
+
else
|
110
|
+
# Err on the side of caution
|
111
|
+
keep = false
|
112
|
+
end
|
113
|
+
|
114
|
+
return keep
|
115
|
+
|
19
116
|
end
|
117
|
+
|
118
|
+
module Scrubbable
|
119
|
+
def scrubbable?
|
120
|
+
! [ Hpricot::Text,
|
121
|
+
Hpricot::BogusETag,
|
122
|
+
].include?(self.class) && self.respond_to?(:scrub)
|
123
|
+
end
|
124
|
+
end
|
125
|
+
|
20
126
|
end
|
21
127
|
|
22
128
|
class Elements
|
@@ -25,73 +131,124 @@ module Hpricot
|
|
25
131
|
end
|
26
132
|
|
27
133
|
def strip_attributes(safe=[])
|
28
|
-
each { |x| x.
|
134
|
+
each { |x| x.scrub_attributes(safe) }
|
29
135
|
end
|
30
136
|
end
|
31
137
|
|
32
138
|
class BaseEle
|
33
|
-
include
|
139
|
+
include Scrub::Scrubbable
|
34
140
|
end
|
35
141
|
|
36
|
-
class
|
37
|
-
include
|
142
|
+
class Comment
|
143
|
+
include Scrub::Scrubbable
|
38
144
|
|
39
|
-
def
|
40
|
-
children.
|
41
|
-
strip unless config[:allow_tags].include?(name)
|
145
|
+
def remove
|
146
|
+
parent.children.delete(self)
|
42
147
|
end
|
43
148
|
|
149
|
+
#
|
150
|
+
# Scrubs this comment according to the given config
|
151
|
+
# If the config key :default_comment_rule is true, the comment is kept. Otherwise it's removed.
|
152
|
+
#
|
153
|
+
def scrub(config = nil)
|
154
|
+
config = Scrub::normalize_config(config)
|
155
|
+
rule = config[:default_comment_rule]
|
156
|
+
remove unless rule
|
157
|
+
return true
|
158
|
+
end
|
159
|
+
|
160
|
+
end
|
161
|
+
|
162
|
+
class Elem
|
163
|
+
include Scrub::Scrubbable
|
164
|
+
|
44
165
|
def remove
|
45
166
|
parent.children.delete(self)
|
46
167
|
end
|
47
168
|
|
48
169
|
def strip
|
49
|
-
|
170
|
+
swap(inner_html)
|
171
|
+
end
|
172
|
+
|
173
|
+
#
|
174
|
+
# Scrubs the element according to the given config
|
175
|
+
# The relevant config key is :elem_rules. It is expected to be a Hash having String HTML tag names as keys, and a rule as values
|
176
|
+
# The rule value dictates what happens to the element. The following logic is used:
|
177
|
+
# If the rule is false, the element is removed
|
178
|
+
# If the rule is :strip, the element is stripped (the element itself is deleted and its children are promoted upwards to where it was)
|
179
|
+
# Otherwise the element is kept
|
180
|
+
#
|
181
|
+
# If the element name (HTML tag) was not found in :elem_rules, the default rule in config key :default_elem_rule is used
|
182
|
+
#
|
183
|
+
# After the above is done, scrub_attributes is called if the element was kept. The element's rule is passed to it as the attribute rule (see Hpricot::Scrub.keep_attribute?), UNLESS the rule was explicitly true, in which case the value of config key :default_attribute_rule is passed instead.
|
184
|
+
#
|
185
|
+
# This is recursive and will do all the above to all the children of the element as well.
|
186
|
+
#
|
187
|
+
def scrub(config = nil)
|
50
188
|
|
51
|
-
|
189
|
+
config = Scrub::normalize_config(config)
|
190
|
+
|
191
|
+
children.reverse.each do |child|
|
192
|
+
child.scrub(config) if child.scrubbable?
|
193
|
+
end
|
194
|
+
|
195
|
+
rule = config[:elem_rules].has_key?(name) ? config[:elem_rules][name] : config[:default_elem_rule]
|
196
|
+
|
197
|
+
if !rule
|
52
198
|
remove
|
199
|
+
elsif rule == :strip
|
200
|
+
strip
|
53
201
|
else
|
54
|
-
|
202
|
+
# Positive rule
|
203
|
+
# Keep the element
|
204
|
+
# On to attributes
|
205
|
+
scrub_attributes(rule == true ? config[:default_attribute_rule] : rule)
|
55
206
|
end
|
207
|
+
|
208
|
+
return self
|
56
209
|
end
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
210
|
+
|
211
|
+
#
|
212
|
+
# Loops over all the attributes on this element, and removes any which Hpricot::Scrub.keep_attribute? returns false for
|
213
|
+
#
|
214
|
+
def scrub_attributes(attribute_rule = nil)
|
215
|
+
if attributes
|
216
|
+
attributes.each do |key, value|
|
217
|
+
remove_attribute(key) unless Scrub.keep_attribute?(self, key, value, attribute_rule)
|
218
|
+
end
|
219
|
+
end
|
220
|
+
return true
|
62
221
|
end
|
63
222
|
|
64
|
-
|
65
|
-
# TODO: find other elements that should be removed instead of stripped
|
66
|
-
attributes && attributes['type'] =~ /script|css/
|
67
|
-
end
|
68
|
-
end
|
223
|
+
end #class Elem
|
69
224
|
|
70
225
|
class Doc
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
config
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
self
|
226
|
+
|
227
|
+
#
|
228
|
+
# Scrubs the Hpricot document by removing certain elements and attributes
|
229
|
+
# according to the passed-in config
|
230
|
+
# WARNING: This is destructive. If you want to keep your document untouched use a duplicate copy
|
231
|
+
# See the documentation on Hpricot::Elem#scrub for documentation of config
|
232
|
+
#
|
233
|
+
def scrub(config=nil)
|
234
|
+
config = Scrub::normalize_config(config)
|
235
|
+
children.reverse.each do |child|
|
236
|
+
child.scrub(config) if child.scrubbable?
|
237
|
+
end
|
238
|
+
return self
|
84
239
|
end
|
85
|
-
|
240
|
+
|
241
|
+
end #class Doc
|
242
|
+
|
86
243
|
end
|
87
244
|
|
88
245
|
class String
|
89
|
-
def scrub!
|
90
|
-
self.gsub!(/^(\n|.)*$/, Hpricot(self).scrub.inner_html)
|
246
|
+
def scrub!(config=nil)
|
247
|
+
self.gsub!(/^(\n|.)*$/, Hpricot(self).scrub(config).inner_html)
|
91
248
|
end
|
92
249
|
|
93
|
-
def scrub
|
94
|
-
dup.scrub!
|
250
|
+
def scrub(config=nil)
|
251
|
+
dup.scrub!(config)
|
95
252
|
end
|
96
253
|
end
|
97
254
|
|
@@ -116,4 +273,5 @@ begin
|
|
116
273
|
dup.decode!
|
117
274
|
end
|
118
275
|
end
|
276
|
+
|
119
277
|
rescue LoadError; end
|
data/test/hpricot_scrub_test.rb
CHANGED
@@ -1,84 +1,128 @@
|
|
1
|
+
#
|
2
|
+
# This test case tests HpricotScrub features that were introduced in version 0.3.0
|
3
|
+
# with the introduction of more fine-grained filtering
|
4
|
+
#
|
1
5
|
require File.dirname(__FILE__) + '/test_helper.rb'
|
2
6
|
require File.dirname(__FILE__) + '/scrubber_data.rb'
|
7
|
+
require "uri"
|
3
8
|
|
4
9
|
class HpricotScrubTest < Test::Unit::TestCase
|
5
10
|
|
6
11
|
def setup
|
7
|
-
@clean = Hpricot(MARKUP).scrub.inner_html
|
8
|
-
@config = YAML.load_file('examples/config.yml')
|
9
12
|
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
+
config = {
|
14
|
+
:elem_rules => {
|
15
|
+
"a" => {
|
16
|
+
"href" => %r|^https?://|i
|
17
|
+
},
|
18
|
+
"b" => true,
|
19
|
+
"body" => {
|
20
|
+
"lang" => %w(en es fr)
|
21
|
+
},
|
22
|
+
"br" => true,
|
23
|
+
"div" => %w(id class style),
|
24
|
+
"hr" => true,
|
25
|
+
"html" => true,
|
26
|
+
"img" => {
|
27
|
+
"src" => Proc.new do |parent_element, attribute_key, attribute_value|
|
28
|
+
begin
|
29
|
+
uri = URI.parse(attribute_value)
|
30
|
+
uri.is_a?(URI::HTTP) && uri.host != /imageshack/i
|
31
|
+
rescue
|
32
|
+
false
|
33
|
+
end
|
34
|
+
end,
|
35
|
+
"align" => "middle",
|
36
|
+
"alt" => true
|
37
|
+
},
|
38
|
+
"marquee" => :strip,
|
39
|
+
"p" => true,
|
40
|
+
"script" => false,
|
41
|
+
"span" => :strip,
|
42
|
+
"strong" => true,
|
43
|
+
"style" => false
|
44
|
+
},
|
45
|
+
:default_elem_rule => :strip,
|
46
|
+
:default_comment_rule => false,
|
47
|
+
:default_attribute_rule => false
|
48
|
+
}
|
49
|
+
|
50
|
+
@docs = [
|
51
|
+
Hpricot(MARKUP),
|
52
|
+
Hpricot(GOOGLE)
|
53
|
+
]
|
54
|
+
@scrubbed_docs = [
|
55
|
+
Hpricot(MARKUP).scrub(config),
|
56
|
+
Hpricot(GOOGLE).scrub(config)
|
57
|
+
]
|
58
|
+
|
13
59
|
end
|
14
60
|
|
15
|
-
def
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
partial_scrub_common(doc, full)
|
61
|
+
def test_elem_default_rule_strips
|
62
|
+
@scrubbed_docs.each do |doc|
|
63
|
+
assert_equal 0, doc.search("//span").length
|
64
|
+
end
|
20
65
|
end
|
21
66
|
|
22
|
-
def
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
assert_tag_count(doc, 'br', 0)
|
29
|
-
assert_tag_count(doc, 'div', 0)
|
30
|
-
assert_tag_count(doc, 'script', 0)
|
67
|
+
def test_elem_rule_keep
|
68
|
+
@scrubbed_docs.each_with_index do |doc, i|
|
69
|
+
assert_equal @docs[i].search("//a").length, doc.search("//a").length
|
70
|
+
assert_equal @docs[i].search("//b").length, doc.search("//b").length
|
71
|
+
assert_equal @docs[i].search("//img").length, doc.search("//img").length
|
72
|
+
end
|
31
73
|
end
|
32
74
|
|
33
|
-
def
|
34
|
-
|
35
|
-
|
36
|
-
|
75
|
+
def test_elem_rule_remove
|
76
|
+
@scrubbed_docs.each do |doc|
|
77
|
+
assert_equal 0, doc.search("//script").length
|
78
|
+
assert_equal 0, doc.search("//style").length
|
79
|
+
end
|
37
80
|
end
|
38
81
|
|
39
|
-
def
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
assert_tag_count(doc, 'br', 0)
|
45
|
-
assert_tag_count(doc, 'div', 0)
|
46
|
-
assert_tag_count(doc, 'script', 0)
|
82
|
+
def test_elem_rule_strip
|
83
|
+
@scrubbed_docs.each do |doc|
|
84
|
+
assert_equal 0, doc.search("//marquee").length
|
85
|
+
assert_equal 0, doc.search("//span").length
|
86
|
+
end
|
47
87
|
end
|
48
88
|
|
49
|
-
def
|
50
|
-
|
51
|
-
|
52
|
-
|
89
|
+
def test_attr_default_rule_removes
|
90
|
+
@scrubbed_docs.each do |doc|
|
91
|
+
assert_equal 0, doc.search("*[@mce_src]").length
|
92
|
+
assert_equal 0, doc.search("*[@target]").length
|
93
|
+
end
|
53
94
|
end
|
54
95
|
|
55
|
-
def
|
56
|
-
|
57
|
-
|
58
|
-
|
96
|
+
def test_attr_rule_true
|
97
|
+
@scrubbed_docs.each_with_index do |doc, i|
|
98
|
+
assert_equal @docs[i].search("*[@alt]").length, doc.search("*[@alt]").length
|
99
|
+
end
|
59
100
|
end
|
60
101
|
|
61
|
-
def
|
62
|
-
|
63
|
-
|
64
|
-
|
102
|
+
def test_attr_rule_string
|
103
|
+
@scrubbed_docs.each_with_index do |doc, i|
|
104
|
+
assert_equal @docs[i].search("img[@align='middle']").length, doc.search("img[@align]").length
|
105
|
+
end
|
106
|
+
end
|
65
107
|
|
66
|
-
|
67
|
-
|
108
|
+
def test_attr_rule_regexp
|
109
|
+
@scrubbed_docs.each_with_index do |doc, i|
|
110
|
+
assert_equal @docs[i].search("a[@href^='http']").length, doc.search("a[@href]").length
|
111
|
+
end
|
112
|
+
end
|
68
113
|
|
69
|
-
|
70
|
-
|
71
|
-
|
114
|
+
def test_attr_rule_array
|
115
|
+
@scrubbed_docs.each_with_index do |doc, i|
|
116
|
+
assert_equal @docs[i].search("div[@id]").length, doc.search("div[@id]").length
|
117
|
+
assert_equal @docs[i].search("div[@class]").length, doc.search("div[@class]").length
|
118
|
+
assert_equal @docs[i].search("div[@style]").length, doc.search("div[@style]").length
|
119
|
+
end
|
72
120
|
end
|
73
121
|
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
assert_tag_count(doc, 'p', full.search('//p').size)
|
79
|
-
assert_tag_count(doc, 'div', full.search('//div').size)
|
80
|
-
assert_tag_count(doc, 'img', full.search('//img').size)
|
81
|
-
assert_tag_count(doc, 'br', full.search('//br').size)
|
82
|
-
assert_tag_count(doc, 'script', 0)
|
122
|
+
def test_attr_rule_proc
|
123
|
+
@scrubbed_docs.each_with_index do |doc, i|
|
124
|
+
assert_equal @docs[i].search("img[@src^='http']").length, doc.search("img[@src]").length
|
125
|
+
end
|
83
126
|
end
|
127
|
+
|
84
128
|
end
|
@@ -0,0 +1,88 @@
|
|
1
|
+
#
|
2
|
+
# This test case tests HpricotScrub features that were present in version 0.2.3 before
|
3
|
+
# introduction of more fine-grained filtering
|
4
|
+
#
|
5
|
+
require File.dirname(__FILE__) + '/test_helper.rb'
|
6
|
+
require File.dirname(__FILE__) + '/scrubber_data.rb'
|
7
|
+
|
8
|
+
class OldHpricotScrubTest < Test::Unit::TestCase
|
9
|
+
|
10
|
+
def setup
|
11
|
+
@clean = Hpricot(MARKUP).scrub.inner_html
|
12
|
+
@config = YAML.load_file('examples/old_config.yml')
|
13
|
+
|
14
|
+
# add some tags that most users will probably want
|
15
|
+
@config_full = @config.dup
|
16
|
+
%w(body head html).each { |x| @config_full[:allow_tags].push(x) }
|
17
|
+
end
|
18
|
+
|
19
|
+
def test_full_markup_partial_scrub
|
20
|
+
full = Hpricot(MARKUP)
|
21
|
+
full_markup = '<html><head></head><body>' + MARKUP + '</body></html>'
|
22
|
+
doc = Hpricot(full_markup).scrub(@config_full)
|
23
|
+
partial_scrub_common(doc, full)
|
24
|
+
end
|
25
|
+
|
26
|
+
def test_full_scrub
|
27
|
+
doc = Hpricot(MARKUP).scrub
|
28
|
+
# using the divisor search throws warnings in test
|
29
|
+
assert_tag_count(doc, 'a', 0)
|
30
|
+
assert_tag_count(doc, 'p', 0)
|
31
|
+
assert_tag_count(doc, 'img', 0)
|
32
|
+
assert_tag_count(doc, 'br', 0)
|
33
|
+
assert_tag_count(doc, 'div', 0)
|
34
|
+
assert_tag_count(doc, 'script', 0)
|
35
|
+
end
|
36
|
+
|
37
|
+
def test_partial_scrub
|
38
|
+
full = Hpricot(MARKUP)
|
39
|
+
doc = Hpricot(MARKUP).scrub(@config)
|
40
|
+
partial_scrub_common(doc, full)
|
41
|
+
end
|
42
|
+
|
43
|
+
def test_full_doc
|
44
|
+
doc = Hpricot(GOOGLE).scrub
|
45
|
+
assert_tag_count(doc, 'a', 0)
|
46
|
+
assert_tag_count(doc, 'p', 0)
|
47
|
+
assert_tag_count(doc, 'img', 0)
|
48
|
+
assert_tag_count(doc, 'br', 0)
|
49
|
+
assert_tag_count(doc, 'div', 0)
|
50
|
+
assert_tag_count(doc, 'script', 0)
|
51
|
+
end
|
52
|
+
|
53
|
+
def test_string_scrub
|
54
|
+
formatted = MARKUP
|
55
|
+
assert formatted.scrub == @clean
|
56
|
+
assert formatted == MARKUP
|
57
|
+
end
|
58
|
+
|
59
|
+
def test_string_scrub!
|
60
|
+
formatted = MARKUP
|
61
|
+
assert formatted.scrub! == @clean
|
62
|
+
assert formatted == @clean
|
63
|
+
end
|
64
|
+
|
65
|
+
def test_decoder
|
66
|
+
str = 'some <a href="http://example.com/">example link</a> to nowhere'
|
67
|
+
scrubbed_str = str.scrub
|
68
|
+
assert scrubbed_str.include?(' ')
|
69
|
+
|
70
|
+
if defined?(HTMLEntities)
|
71
|
+
assert ! scrubbed_str.decode.include?(' ')
|
72
|
+
|
73
|
+
scrubbed_str.decode!
|
74
|
+
assert ! scrubbed_str.include?(' ')
|
75
|
+
end
|
76
|
+
end
|
77
|
+
|
78
|
+
private
|
79
|
+
def partial_scrub_common(doc, full)
|
80
|
+
# using the divisor search throws warnings in test
|
81
|
+
assert_tag_count(doc, 'a', 0)
|
82
|
+
assert_tag_count(doc, 'p', full.search('//p').size)
|
83
|
+
assert_tag_count(doc, 'div', full.search('//div').size)
|
84
|
+
assert_tag_count(doc, 'img', full.search('//img').size)
|
85
|
+
assert_tag_count(doc, 'br', full.search('//br').size)
|
86
|
+
assert_tag_count(doc, 'script', 0)
|
87
|
+
end
|
88
|
+
end
|
metadata
CHANGED
@@ -1,10 +1,10 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
|
-
rubygems_version: 0.9.
|
2
|
+
rubygems_version: 0.9.4
|
3
3
|
specification_version: 1
|
4
4
|
name: hpricot_scrub
|
5
5
|
version: !ruby/object:Gem::Version
|
6
|
-
version: 0.
|
7
|
-
date:
|
6
|
+
version: 0.3.0
|
7
|
+
date: 2008-04-22 00:00:00 -05:00
|
8
8
|
summary: Scrub HTML with Hpricot
|
9
9
|
require_paths:
|
10
10
|
- lib
|
@@ -40,13 +40,18 @@ files:
|
|
40
40
|
- test/test_helper.rb
|
41
41
|
- test/scrubber_data.rb
|
42
42
|
- test/hpricot_scrub_test.rb
|
43
|
-
-
|
43
|
+
- test/old_hpricot_scrub_test.rb
|
44
|
+
- examples/old_config.yml
|
44
45
|
test_files:
|
45
46
|
- test/hpricot_scrub_test.rb
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
47
|
+
- test/old_hpricot_scrub_test.rb
|
48
|
+
rdoc_options:
|
49
|
+
- --main
|
50
|
+
- README.txt
|
51
|
+
extra_rdoc_files:
|
52
|
+
- README.txt
|
53
|
+
- CHANGELOG.txt
|
54
|
+
- Manifest.txt
|
50
55
|
executables: []
|
51
56
|
|
52
57
|
extensions: []
|
File without changes
|