hpricot_scrub 0.2.3 → 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG.txt +9 -0
- data/Manifest.txt +2 -1
- data/lib/hpricot_scrub/hpricot_scrub.rb +203 -45
- data/lib/hpricot_scrub/version.rb +2 -2
- data/test/hpricot_scrub_test.rb +101 -57
- data/test/old_hpricot_scrub_test.rb +88 -0
- metadata +13 -8
- /data/examples/{config.yml → old_config.yml} +0 -0
data/CHANGELOG.txt
CHANGED
@@ -1,3 +1,12 @@
|
|
1
|
+
2008-01-11 Mina Naguib <mina.hpricotscrub@naguib.ca>
|
2
|
+
Release 0.3.0
|
3
|
+
Large overhaul of the module's logic to mimic most of perl's HTML::Scrubber
|
4
|
+
functionality:
|
5
|
+
- Deprecate config keys :allow_tags, :remove_tags and :allow_attributes
|
6
|
+
- Introduce config keys :elem_rules, :default_elem_rule,
|
7
|
+
:default_comment_rule and :default_attribute_rule
|
8
|
+
- Document the above (inline - visible in rdoc & the likes)
|
9
|
+
|
1
10
|
2007-04-05 Michael <michael@underpantsgnome.com>
|
2
11
|
Release 0.2.3
|
3
12
|
Add patches from Eric Wong
|
data/Manifest.txt
CHANGED
@@ -9,14 +9,120 @@ end
|
|
9
9
|
require 'hpricot'
|
10
10
|
|
11
11
|
module Hpricot
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
12
|
+
|
13
|
+
class Scrub
|
14
|
+
|
15
|
+
def self.normalize_config(config) #:nodoc:#
|
16
|
+
config = {} unless config.is_a?(Hash)
|
17
|
+
|
18
|
+
return config if config[:normalized]
|
19
|
+
|
20
|
+
config = {
|
21
|
+
|
22
|
+
# Legacy config keys:
|
23
|
+
:remove_tags => [],
|
24
|
+
:allow_tags => [],
|
25
|
+
:allow_attributes => [],
|
26
|
+
|
27
|
+
# New fine-grained hotness:
|
28
|
+
:elem_rules => {
|
29
|
+
"script" => false,
|
30
|
+
"style" => false
|
31
|
+
},
|
32
|
+
:default_elem_rule => :strip,
|
33
|
+
:default_comment_rule => false,
|
34
|
+
:default_attribute_rule => false
|
35
|
+
|
36
|
+
}.merge(config)
|
37
|
+
|
38
|
+
#
|
39
|
+
# Merge+delete legacy config keys
|
40
|
+
#
|
41
|
+
# :remove_tags
|
42
|
+
(config.delete(:remove_tags) || []).each do |tag|
|
43
|
+
config[:elem_rules][tag] = false unless config[:elem_rules].has_key?(tag)
|
44
|
+
end
|
45
|
+
# :allow_tags
|
46
|
+
(config.delete(:allow_tags) || []).each do |tag|
|
47
|
+
config[:elem_rules][tag] = true unless config[:elem_rules].has_key?(tag)
|
48
|
+
end
|
49
|
+
# :allow_attributes
|
50
|
+
(config.delete(:allow_attributes) || []).each do |attribute|
|
51
|
+
#
|
52
|
+
# Add it to the default attribute rule
|
53
|
+
#
|
54
|
+
old_rule = config[:default_attribute_rule]
|
55
|
+
config[:default_attribute_rule] = Proc.new do |parent_element, key, value|
|
56
|
+
if key == attribute
|
57
|
+
true
|
58
|
+
else
|
59
|
+
Scrub::keep_attribute?(parent_element, key, value, old_rule)
|
60
|
+
end
|
61
|
+
end
|
62
|
+
end
|
63
|
+
|
64
|
+
config[:normalized] = true
|
65
|
+
return config
|
66
|
+
end
|
67
|
+
|
68
|
+
#
|
69
|
+
# Takes:
|
70
|
+
#
|
71
|
+
# An element
|
72
|
+
# An attribute key found in that element
|
73
|
+
# The attribute value attached to the key
|
74
|
+
# An attribute rule
|
75
|
+
#
|
76
|
+
# Checks the rule against the attribute and returns:
|
77
|
+
#
|
78
|
+
# true = the attribute should be kept
|
79
|
+
# false = the attribute should NOT be kept
|
80
|
+
#
|
81
|
+
# Acceptable attribute rules are:
|
82
|
+
#
|
83
|
+
# true: Keep the attribute without inspection
|
84
|
+
# a String: Attribute value must be the same as the string
|
85
|
+
# an Array: Attribute key must exist in the array
|
86
|
+
# a Regexp: Attribute value must match the regexp
|
87
|
+
# a Hash: The attribute key is found in the hash, and the value is considered a new rule and follows these same rules via recursion
|
88
|
+
# a Proc: The Proc is called with arguments (parent_element, key, value), the returned value is considered a new rule and follows these same rules via recursion
|
89
|
+
# otherwise: Remove the attribute
|
90
|
+
#
|
91
|
+
def self.keep_attribute?(parent_element, key, value, attribute_rule)
|
92
|
+
|
93
|
+
if attribute_rule == true
|
94
|
+
keep = true
|
95
|
+
elsif attribute_rule.is_a?(String)
|
96
|
+
keep = (attribute_rule == value)
|
97
|
+
elsif attribute_rule.is_a?(Array)
|
98
|
+
keep = attribute_rule.include?(key)
|
99
|
+
elsif attribute_rule.is_a?(Regexp)
|
100
|
+
keep = attribute_rule.match(value)
|
101
|
+
elsif attribute_rule.is_a?(Hash)
|
102
|
+
# Allow hash value to be new rule via recursion
|
103
|
+
new_rule = attribute_rule[key]
|
104
|
+
keep = keep_attribute?(parent_element, key, value, new_rule)
|
105
|
+
elsif attribute_rule.is_a?(Proc)
|
106
|
+
# Allow the proc to return a new rule - recurse:
|
107
|
+
new_rule = attribute_rule.call(parent_element, key, value)
|
108
|
+
keep = keep_attribute?(parent_element, key, value, new_rule)
|
109
|
+
else
|
110
|
+
# Err on the side of caution
|
111
|
+
keep = false
|
112
|
+
end
|
113
|
+
|
114
|
+
return keep
|
115
|
+
|
19
116
|
end
|
117
|
+
|
118
|
+
module Scrubbable
|
119
|
+
def scrubbable?
|
120
|
+
! [ Hpricot::Text,
|
121
|
+
Hpricot::BogusETag,
|
122
|
+
].include?(self.class) && self.respond_to?(:scrub)
|
123
|
+
end
|
124
|
+
end
|
125
|
+
|
20
126
|
end
|
21
127
|
|
22
128
|
class Elements
|
@@ -25,73 +131,124 @@ module Hpricot
|
|
25
131
|
end
|
26
132
|
|
27
133
|
def strip_attributes(safe=[])
|
28
|
-
each { |x| x.
|
134
|
+
each { |x| x.scrub_attributes(safe) }
|
29
135
|
end
|
30
136
|
end
|
31
137
|
|
32
138
|
class BaseEle
|
33
|
-
include
|
139
|
+
include Scrub::Scrubbable
|
34
140
|
end
|
35
141
|
|
36
|
-
class
|
37
|
-
include
|
142
|
+
class Comment
|
143
|
+
include Scrub::Scrubbable
|
38
144
|
|
39
|
-
def
|
40
|
-
children.
|
41
|
-
strip unless config[:allow_tags].include?(name)
|
145
|
+
def remove
|
146
|
+
parent.children.delete(self)
|
42
147
|
end
|
43
148
|
|
149
|
+
#
|
150
|
+
# Scrubs this comment according to the given config
|
151
|
+
# If the config key :default_comment_rule is true, the comment is kept. Otherwise it's removed.
|
152
|
+
#
|
153
|
+
def scrub(config = nil)
|
154
|
+
config = Scrub::normalize_config(config)
|
155
|
+
rule = config[:default_comment_rule]
|
156
|
+
remove unless rule
|
157
|
+
return true
|
158
|
+
end
|
159
|
+
|
160
|
+
end
|
161
|
+
|
162
|
+
class Elem
|
163
|
+
include Scrub::Scrubbable
|
164
|
+
|
44
165
|
def remove
|
45
166
|
parent.children.delete(self)
|
46
167
|
end
|
47
168
|
|
48
169
|
def strip
|
49
|
-
|
170
|
+
swap(inner_html)
|
171
|
+
end
|
172
|
+
|
173
|
+
#
|
174
|
+
# Scrubs the element according to the given config
|
175
|
+
# The relevant config key is :elem_rules. It is expected to be a Hash having String HTML tag names as keys, and a rule as values
|
176
|
+
# The rule value dictates what happens to the element. The following logic is used:
|
177
|
+
# If the rule is false, the element is removed
|
178
|
+
# If the rule is :strip, the element is stripped (the element itself is deleted and its children are promoted upwards to where it was)
|
179
|
+
# Otherwise the element is kept
|
180
|
+
#
|
181
|
+
# If the element name (HTML tag) was not found in :elem_rules, the default rule in config key :default_elem_rule is used
|
182
|
+
#
|
183
|
+
# After the above is done, scrub_attributes is called if the element was kept. The rule is passed to it as it's assumed to be the attribute rules (see Hpricot::Scrub.keep_attribute?) to apply to the attributes, UNLESS the rule was explicitly "true", in which case the config key :default_attribute_rule is passed.
|
184
|
+
#
|
185
|
+
# This is recursive and will do all the above to all the children of the element as well.
|
186
|
+
#
|
187
|
+
def scrub(config = nil)
|
50
188
|
|
51
|
-
|
189
|
+
config = Scrub::normalize_config(config)
|
190
|
+
|
191
|
+
children.reverse.each do |child|
|
192
|
+
child.scrub(config) if child.scrubbable?
|
193
|
+
end
|
194
|
+
|
195
|
+
rule = config[:elem_rules].has_key?(name) ? config[:elem_rules][name] : config[:default_elem_rule]
|
196
|
+
|
197
|
+
if !rule
|
52
198
|
remove
|
199
|
+
elsif rule == :strip
|
200
|
+
strip
|
53
201
|
else
|
54
|
-
|
202
|
+
# Positive rule
|
203
|
+
# Keep the element
|
204
|
+
# On to attributes
|
205
|
+
scrub_attributes(rule == true ? config[:default_attribute_rule] : rule)
|
55
206
|
end
|
207
|
+
|
208
|
+
return self
|
56
209
|
end
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
210
|
+
|
211
|
+
#
|
212
|
+
# Loops over all the attributes on this element, and removes any which Hpricot::Scrub.keep_attribute? returns false for
|
213
|
+
#
|
214
|
+
def scrub_attributes(attribute_rule = nil)
|
215
|
+
if attributes
|
216
|
+
attributes.each do |key, value|
|
217
|
+
remove_attribute(key) unless Scrub.keep_attribute?(self, key, value, attribute_rule)
|
218
|
+
end
|
219
|
+
end
|
220
|
+
return true
|
62
221
|
end
|
63
222
|
|
64
|
-
|
65
|
-
# TODO: find other elements that should be removed instead of stripped
|
66
|
-
attributes && attributes['type'] =~ /script|css/
|
67
|
-
end
|
68
|
-
end
|
223
|
+
end #class Elem
|
69
224
|
|
70
225
|
class Doc
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
config
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
self
|
226
|
+
|
227
|
+
#
|
228
|
+
# Scrubs the Hpricot document by removing certain elements and attributes
|
229
|
+
# according to the passed-in config
|
230
|
+
# WARNING: This is destructive. If you want to keep your document untouched use a duplicate copy
|
231
|
+
# See the documentation on Hpricot::Elem#scrub for documentation of config
|
232
|
+
#
|
233
|
+
def scrub(config=nil)
|
234
|
+
config = Scrub::normalize_config(config)
|
235
|
+
children.reverse.each do |child|
|
236
|
+
child.scrub(config) if child.scrubbable?
|
237
|
+
end
|
238
|
+
return self
|
84
239
|
end
|
85
|
-
|
240
|
+
|
241
|
+
end #class Doc
|
242
|
+
|
86
243
|
end
|
87
244
|
|
88
245
|
class String
|
89
|
-
def scrub!
|
90
|
-
self.gsub!(/^(\n|.)*$/, Hpricot(self).scrub.inner_html)
|
246
|
+
def scrub!(config=nil)
|
247
|
+
self.gsub!(/^(\n|.)*$/, Hpricot(self).scrub(config).inner_html)
|
91
248
|
end
|
92
249
|
|
93
|
-
def scrub
|
94
|
-
dup.scrub!
|
250
|
+
def scrub(config=nil)
|
251
|
+
dup.scrub!(config)
|
95
252
|
end
|
96
253
|
end
|
97
254
|
|
@@ -116,4 +273,5 @@ begin
|
|
116
273
|
dup.decode!
|
117
274
|
end
|
118
275
|
end
|
276
|
+
|
119
277
|
rescue LoadError; end
|
data/test/hpricot_scrub_test.rb
CHANGED
@@ -1,84 +1,128 @@
|
|
1
|
+
#
|
2
|
+
# This test case tests HpricotScrub features that were introduced in version 0.3.0
|
3
|
+
# with the introduction of more fine-grained filtering
|
4
|
+
#
|
1
5
|
require File.dirname(__FILE__) + '/test_helper.rb'
|
2
6
|
require File.dirname(__FILE__) + '/scrubber_data.rb'
|
7
|
+
require "uri"
|
3
8
|
|
4
9
|
class HpricotScrubTest < Test::Unit::TestCase
|
5
10
|
|
6
11
|
def setup
|
7
|
-
@clean = Hpricot(MARKUP).scrub.inner_html
|
8
|
-
@config = YAML.load_file('examples/config.yml')
|
9
12
|
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
+
config = {
|
14
|
+
:elem_rules => {
|
15
|
+
"a" => {
|
16
|
+
"href" => %r|^https?://|i
|
17
|
+
},
|
18
|
+
"b" => true,
|
19
|
+
"body" => {
|
20
|
+
"lang" => %w(en es fr)
|
21
|
+
},
|
22
|
+
"br" => true,
|
23
|
+
"div" => %w(id class style),
|
24
|
+
"hr" => true,
|
25
|
+
"html" => true,
|
26
|
+
"img" => {
|
27
|
+
"src" => Proc.new do |parent_element, attribute_key, attribute_value|
|
28
|
+
begin
|
29
|
+
uri = URI.parse(attribute_value)
|
30
|
+
uri.is_a?(URI::HTTP) && uri.host != /imageshack/i
|
31
|
+
rescue
|
32
|
+
false
|
33
|
+
end
|
34
|
+
end,
|
35
|
+
"align" => "middle",
|
36
|
+
"alt" => true
|
37
|
+
},
|
38
|
+
"marquee" => :strip,
|
39
|
+
"p" => true,
|
40
|
+
"script" => false,
|
41
|
+
"span" => :strip,
|
42
|
+
"strong" => true,
|
43
|
+
"style" => false
|
44
|
+
},
|
45
|
+
:default_elem_rule => :strip,
|
46
|
+
:default_comment_rule => false,
|
47
|
+
:default_attribute_rule => false
|
48
|
+
}
|
49
|
+
|
50
|
+
@docs = [
|
51
|
+
Hpricot(MARKUP),
|
52
|
+
Hpricot(GOOGLE)
|
53
|
+
]
|
54
|
+
@scrubbed_docs = [
|
55
|
+
Hpricot(MARKUP).scrub(config),
|
56
|
+
Hpricot(GOOGLE).scrub(config)
|
57
|
+
]
|
58
|
+
|
13
59
|
end
|
14
60
|
|
15
|
-
def
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
partial_scrub_common(doc, full)
|
61
|
+
def test_elem_default_rule_strips
|
62
|
+
@scrubbed_docs.each do |doc|
|
63
|
+
assert_equal 0, doc.search("//span").length
|
64
|
+
end
|
20
65
|
end
|
21
66
|
|
22
|
-
def
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
assert_tag_count(doc, 'br', 0)
|
29
|
-
assert_tag_count(doc, 'div', 0)
|
30
|
-
assert_tag_count(doc, 'script', 0)
|
67
|
+
def test_elem_rule_keep
|
68
|
+
@scrubbed_docs.each_with_index do |doc, i|
|
69
|
+
assert_equal @docs[i].search("//a").length, doc.search("//a").length
|
70
|
+
assert_equal @docs[i].search("//b").length, doc.search("//b").length
|
71
|
+
assert_equal @docs[i].search("//img").length, doc.search("//img").length
|
72
|
+
end
|
31
73
|
end
|
32
74
|
|
33
|
-
def
|
34
|
-
|
35
|
-
|
36
|
-
|
75
|
+
def test_elem_rule_remove
|
76
|
+
@scrubbed_docs.each do |doc|
|
77
|
+
assert_equal 0, doc.search("//script").length
|
78
|
+
assert_equal 0, doc.search("//style").length
|
79
|
+
end
|
37
80
|
end
|
38
81
|
|
39
|
-
def
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
assert_tag_count(doc, 'br', 0)
|
45
|
-
assert_tag_count(doc, 'div', 0)
|
46
|
-
assert_tag_count(doc, 'script', 0)
|
82
|
+
def test_elem_rule_strip
|
83
|
+
@scrubbed_docs.each do |doc|
|
84
|
+
assert_equal 0, doc.search("//marquee").length
|
85
|
+
assert_equal 0, doc.search("//span").length
|
86
|
+
end
|
47
87
|
end
|
48
88
|
|
49
|
-
def
|
50
|
-
|
51
|
-
|
52
|
-
|
89
|
+
def test_attr_default_rule_removes
|
90
|
+
@scrubbed_docs.each do |doc|
|
91
|
+
assert_equal 0, doc.search("*[@mce_src]").length
|
92
|
+
assert_equal 0, doc.search("*[@target]").length
|
93
|
+
end
|
53
94
|
end
|
54
95
|
|
55
|
-
def
|
56
|
-
|
57
|
-
|
58
|
-
|
96
|
+
def test_attr_rule_true
|
97
|
+
@scrubbed_docs.each_with_index do |doc, i|
|
98
|
+
assert_equal @docs[i].search("*[@alt]").length, doc.search("*[@alt]").length
|
99
|
+
end
|
59
100
|
end
|
60
101
|
|
61
|
-
def
|
62
|
-
|
63
|
-
|
64
|
-
|
102
|
+
def test_attr_rule_string
|
103
|
+
@scrubbed_docs.each_with_index do |doc, i|
|
104
|
+
assert_equal @docs[i].search("img[@align='middle']").length, doc.search("img[@align]").length
|
105
|
+
end
|
106
|
+
end
|
65
107
|
|
66
|
-
|
67
|
-
|
108
|
+
def test_attr_rule_regexp
|
109
|
+
@scrubbed_docs.each_with_index do |doc, i|
|
110
|
+
assert_equal @docs[i].search("a[@href^='http']").length, doc.search("a[@href]").length
|
111
|
+
end
|
112
|
+
end
|
68
113
|
|
69
|
-
|
70
|
-
|
71
|
-
|
114
|
+
def test_attr_rule_array
|
115
|
+
@scrubbed_docs.each_with_index do |doc, i|
|
116
|
+
assert_equal @docs[i].search("div[@id]").length, doc.search("div[@id]").length
|
117
|
+
assert_equal @docs[i].search("div[@class]").length, doc.search("div[@class]").length
|
118
|
+
assert_equal @docs[i].search("div[@style]").length, doc.search("div[@style]").length
|
119
|
+
end
|
72
120
|
end
|
73
121
|
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
assert_tag_count(doc, 'p', full.search('//p').size)
|
79
|
-
assert_tag_count(doc, 'div', full.search('//div').size)
|
80
|
-
assert_tag_count(doc, 'img', full.search('//img').size)
|
81
|
-
assert_tag_count(doc, 'br', full.search('//br').size)
|
82
|
-
assert_tag_count(doc, 'script', 0)
|
122
|
+
def test_attr_rule_proc
|
123
|
+
@scrubbed_docs.each_with_index do |doc, i|
|
124
|
+
assert_equal @docs[i].search("img[@src^='http']").length, doc.search("img[@src]").length
|
125
|
+
end
|
83
126
|
end
|
127
|
+
|
84
128
|
end
|
@@ -0,0 +1,88 @@
|
|
1
|
+
#
|
2
|
+
# This test case tests HpricotScrub features that were present in version 0.2.3 before
|
3
|
+
# introduction of more fine-grained filtering
|
4
|
+
#
|
5
|
+
require File.dirname(__FILE__) + '/test_helper.rb'
|
6
|
+
require File.dirname(__FILE__) + '/scrubber_data.rb'
|
7
|
+
|
8
|
+
class OldHpricotScrubTest < Test::Unit::TestCase
|
9
|
+
|
10
|
+
def setup
|
11
|
+
@clean = Hpricot(MARKUP).scrub.inner_html
|
12
|
+
@config = YAML.load_file('examples/old_config.yml')
|
13
|
+
|
14
|
+
# add some tags that most users will probably want
|
15
|
+
@config_full = @config.dup
|
16
|
+
%w(body head html).each { |x| @config_full[:allow_tags].push(x) }
|
17
|
+
end
|
18
|
+
|
19
|
+
def test_full_markup_partial_scrub
|
20
|
+
full = Hpricot(MARKUP)
|
21
|
+
full_markup = '<html><head></head><body>' + MARKUP + '</body></html>'
|
22
|
+
doc = Hpricot(full_markup).scrub(@config_full)
|
23
|
+
partial_scrub_common(doc, full)
|
24
|
+
end
|
25
|
+
|
26
|
+
def test_full_scrub
|
27
|
+
doc = Hpricot(MARKUP).scrub
|
28
|
+
# using the divisor search throws warnings in test
|
29
|
+
assert_tag_count(doc, 'a', 0)
|
30
|
+
assert_tag_count(doc, 'p', 0)
|
31
|
+
assert_tag_count(doc, 'img', 0)
|
32
|
+
assert_tag_count(doc, 'br', 0)
|
33
|
+
assert_tag_count(doc, 'div', 0)
|
34
|
+
assert_tag_count(doc, 'script', 0)
|
35
|
+
end
|
36
|
+
|
37
|
+
def test_partial_scrub
|
38
|
+
full = Hpricot(MARKUP)
|
39
|
+
doc = Hpricot(MARKUP).scrub(@config)
|
40
|
+
partial_scrub_common(doc, full)
|
41
|
+
end
|
42
|
+
|
43
|
+
def test_full_doc
|
44
|
+
doc = Hpricot(GOOGLE).scrub
|
45
|
+
assert_tag_count(doc, 'a', 0)
|
46
|
+
assert_tag_count(doc, 'p', 0)
|
47
|
+
assert_tag_count(doc, 'img', 0)
|
48
|
+
assert_tag_count(doc, 'br', 0)
|
49
|
+
assert_tag_count(doc, 'div', 0)
|
50
|
+
assert_tag_count(doc, 'script', 0)
|
51
|
+
end
|
52
|
+
|
53
|
+
def test_string_scrub
|
54
|
+
formatted = MARKUP
|
55
|
+
assert formatted.scrub == @clean
|
56
|
+
assert formatted == MARKUP
|
57
|
+
end
|
58
|
+
|
59
|
+
def test_string_scrub!
|
60
|
+
formatted = MARKUP
|
61
|
+
assert formatted.scrub! == @clean
|
62
|
+
assert formatted == @clean
|
63
|
+
end
|
64
|
+
|
65
|
+
def test_decoder
|
66
|
+
str = 'some <a href="http://example.com/">example link</a> to nowhere'
|
67
|
+
scrubbed_str = str.scrub
|
68
|
+
assert scrubbed_str.include?(' ')
|
69
|
+
|
70
|
+
if defined?(HTMLEntities)
|
71
|
+
assert ! scrubbed_str.decode.include?(' ')
|
72
|
+
|
73
|
+
scrubbed_str.decode!
|
74
|
+
assert ! scrubbed_str.include?(' ')
|
75
|
+
end
|
76
|
+
end
|
77
|
+
|
78
|
+
private
|
79
|
+
def partial_scrub_common(doc, full)
|
80
|
+
# using the divisor search throws warnings in test
|
81
|
+
assert_tag_count(doc, 'a', 0)
|
82
|
+
assert_tag_count(doc, 'p', full.search('//p').size)
|
83
|
+
assert_tag_count(doc, 'div', full.search('//div').size)
|
84
|
+
assert_tag_count(doc, 'img', full.search('//img').size)
|
85
|
+
assert_tag_count(doc, 'br', full.search('//br').size)
|
86
|
+
assert_tag_count(doc, 'script', 0)
|
87
|
+
end
|
88
|
+
end
|
metadata
CHANGED
@@ -1,10 +1,10 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
|
-
rubygems_version: 0.9.
|
2
|
+
rubygems_version: 0.9.4
|
3
3
|
specification_version: 1
|
4
4
|
name: hpricot_scrub
|
5
5
|
version: !ruby/object:Gem::Version
|
6
|
-
version: 0.
|
7
|
-
date:
|
6
|
+
version: 0.3.0
|
7
|
+
date: 2008-04-22 00:00:00 -05:00
|
8
8
|
summary: Scrub HTML with Hpricot
|
9
9
|
require_paths:
|
10
10
|
- lib
|
@@ -40,13 +40,18 @@ files:
|
|
40
40
|
- test/test_helper.rb
|
41
41
|
- test/scrubber_data.rb
|
42
42
|
- test/hpricot_scrub_test.rb
|
43
|
-
-
|
43
|
+
- test/old_hpricot_scrub_test.rb
|
44
|
+
- examples/old_config.yml
|
44
45
|
test_files:
|
45
46
|
- test/hpricot_scrub_test.rb
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
47
|
+
- test/old_hpricot_scrub_test.rb
|
48
|
+
rdoc_options:
|
49
|
+
- --main
|
50
|
+
- README.txt
|
51
|
+
extra_rdoc_files:
|
52
|
+
- README.txt
|
53
|
+
- CHANGELOG.txt
|
54
|
+
- Manifest.txt
|
50
55
|
executables: []
|
51
56
|
|
52
57
|
extensions: []
|
File without changes
|