hpricot_scrub 0.2.3 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
data/CHANGELOG.txt CHANGED
@@ -1,3 +1,12 @@
1
+ 2008-01-11 Mina Naguib <mina.hpricotscrub@naguib.ca>
2
+ Release 0.3.0
3
+ Large overhaul of the module's logic to mimic most of perl's HTML::Scrubber
4
+ functionality:
5
+ - Deprecate config keys :allow_tags, :remove_tags and :allow_attributes
6
+ - Introduce config keys :elem_rules, :default_elem_rule,
7
+ :default_comment_rule and :default_attribute_rule
8
+ - Document the above (inline - visible in rdoc and the like)
9
+
1
10
  2007-04-05 Michael <michael@underpantsgnome.com>
2
11
  Release 0.2.3
3
12
  Add patches from Eric Wong
data/Manifest.txt CHANGED
@@ -9,4 +9,5 @@ lib/hpricot_scrub.rb
9
9
  test/test_helper.rb
10
10
  test/scrubber_data.rb
11
11
  test/hpricot_scrub_test.rb
12
- examples/config.yml
12
+ test/old_hpricot_scrub_test.rb
13
+ examples/old_config.yml
@@ -9,14 +9,120 @@ end
9
9
  require 'hpricot'
10
10
 
11
11
  module Hpricot
12
- module Scrubable
13
- # TODO: figure out how to handle comments
14
- def scrubable?
15
- ! [ Hpricot::Text,
16
- Hpricot::BogusETag,
17
- Hpricot::Comment
18
- ].include?(self.class) && self.respond_to?(:scrub)
12
+
13
+ class Scrub
14
+
15
+ def self.normalize_config(config) #:nodoc:#
16
+ config = {} unless config.is_a?(Hash)
17
+
18
+ return config if config[:normalized]
19
+
20
+ config = {
21
+
22
+ # Legacy config keys:
23
+ :remove_tags => [],
24
+ :allow_tags => [],
25
+ :allow_attributes => [],
26
+
27
+ # New fine-grained hotness:
28
+ :elem_rules => {
29
+ "script" => false,
30
+ "style" => false
31
+ },
32
+ :default_elem_rule => :strip,
33
+ :default_comment_rule => false,
34
+ :default_attribute_rule => false
35
+
36
+ }.merge(config)
37
+
38
+ #
39
+ # Merge+delete legacy config keys
40
+ #
41
+ # :remove_tags
42
+ (config.delete(:remove_tags) || []).each do |tag|
43
+ config[:elem_rules][tag] = false unless config[:elem_rules].has_key?(tag)
44
+ end
45
+ # :allow_tags
46
+ (config.delete(:allow_tags) || []).each do |tag|
47
+ config[:elem_rules][tag] = true unless config[:elem_rules].has_key?(tag)
48
+ end
49
+ # :allow_attributes
50
+ (config.delete(:allow_attributes) || []).each do |attribute|
51
+ #
52
+ # Add it to the default attribute rule
53
+ #
54
+ old_rule = config[:default_attribute_rule]
55
+ config[:default_attribute_rule] = Proc.new do |parent_element, key, value|
56
+ if key == attribute
57
+ true
58
+ else
59
+ Scrub::keep_attribute?(parent_element, key, value, old_rule)
60
+ end
61
+ end
62
+ end
63
+
64
+ config[:normalized] = true
65
+ return config
66
+ end
67
+
68
+ #
69
+ # Takes:
70
+ #
71
+ # An element
72
+ # An attribute key found in that element
73
+ # The attribute value attached to the key
74
+ # An attribute rule
75
+ #
76
+ # Checks the rule against the attribute and returns:
77
+ #
78
+ # true = the attribute should be kept
79
+ # false = the attribute should NOT be kept
80
+ #
81
+ # Acceptable attribute rules are:
82
+ #
83
+ # true: Keep the attribute without inspection
84
+ # a String: Attribute value must be the same as the string
85
+ # an Array: Attribute key must exist in the array
86
+ # a Regexp: Attribute value must match the regexp
87
+ # a Hash: The attribute key is found in the hash, and the value is considered a new rule and follows these same rules via recursion
88
+ # a Proc: The Proc is called with arguments (parent_element, key, value), the returned value is considered a new rule and follows these same rules via recursion
89
+ # otherwise: Remove the attribute
90
+ #
91
+ def self.keep_attribute?(parent_element, key, value, attribute_rule)
92
+
93
+ if attribute_rule == true
94
+ keep = true
95
+ elsif attribute_rule.is_a?(String)
96
+ keep = (attribute_rule == value)
97
+ elsif attribute_rule.is_a?(Array)
98
+ keep = attribute_rule.include?(key)
99
+ elsif attribute_rule.is_a?(Regexp)
100
+ keep = attribute_rule.match(value)
101
+ elsif attribute_rule.is_a?(Hash)
102
+ # Allow hash value to be new rule via recursion
103
+ new_rule = attribute_rule[key]
104
+ keep = keep_attribute?(parent_element, key, value, new_rule)
105
+ elsif attribute_rule.is_a?(Proc)
106
+ # Allow the proc to return a new rule - recurse:
107
+ new_rule = attribute_rule.call(parent_element, key, value)
108
+ keep = keep_attribute?(parent_element, key, value, new_rule)
109
+ else
110
+ # Err on the side of caution
111
+ keep = false
112
+ end
113
+
114
+ return keep
115
+
19
116
  end
117
+
118
+ module Scrubbable
119
+ def scrubbable?
120
+ ! [ Hpricot::Text,
121
+ Hpricot::BogusETag,
122
+ ].include?(self.class) && self.respond_to?(:scrub)
123
+ end
124
+ end
125
+
20
126
  end
21
127
 
22
128
  class Elements
@@ -25,73 +131,124 @@ module Hpricot
25
131
  end
26
132
 
27
133
  def strip_attributes(safe=[])
28
- each { |x| x.strip_attributes(safe) }
134
+ each { |x| x.scrub_attributes(safe) }
29
135
  end
30
136
  end
31
137
 
32
138
  class BaseEle
33
- include Scrubable
139
+ include Scrub::Scrubbable
34
140
  end
35
141
 
36
- class Elem
37
- include Scrubable
142
+ class Comment
143
+ include Scrub::Scrubbable
38
144
 
39
- def scrub(config)
40
- children.reverse.each { |c| c.scrub(config) if c.scrubable? }
41
- strip unless config[:allow_tags].include?(name)
145
+ def remove
146
+ parent.children.delete(self)
42
147
  end
43
148
 
149
+ #
150
+ # Scrubs this comment according to the given config
151
+ # If the config key :default_comment_rule is true, the comment is kept. Otherwise it's removed.
152
+ #
153
+ def scrub(config = nil)
154
+ config = Scrub::normalize_config(config)
155
+ rule = config[:default_comment_rule]
156
+ remove unless rule
157
+ return true
158
+ end
159
+
160
+ end
161
+
162
+ class Elem
163
+ include Scrub::Scrubbable
164
+
44
165
  def remove
45
166
  parent.children.delete(self)
46
167
  end
47
168
 
48
169
  def strip
49
- children.each { |c| c.strip if c.scrubable? }
170
+ swap(inner_html)
171
+ end
172
+
173
+ #
174
+ # Scrubs the element according to the given config
175
+ # The relevant config key is :elem_rules. It is expected to be a Hash having String HTML tag names as keys, and a rule as values
176
+ # The rule value dictates what happens to the element. The following logic is used:
177
+ # If the rule is false, the element is removed
178
+ # If the rule is :strip, the element is stripped (the element itself is deleted and its children are promoted upwards to where it was)
179
+ # Otherwise the element is kept
180
+ #
181
+ # If the element name (HTML tag) was not found in :elem_rules, the default rule in config key :default_elem_rule is used
182
+ #
183
+ # After the above is done, scrub_attributes is called if the element was kept. The rule is passed to it as it's assumed to be the attribute rules (see Hpricot::Scrub.keep_attribute?) to apply to the attributes, UNLESS the rule was explicitly "true", in which case the config key :default_attribute_rule is passed.
184
+ #
185
+ # This is recursive and will do all the above to all the children of the element as well.
186
+ #
187
+ def scrub(config = nil)
50
188
 
51
- if strip_removes?
189
+ config = Scrub::normalize_config(config)
190
+
191
+ children.reverse.each do |child|
192
+ child.scrub(config) if child.scrubbable?
193
+ end
194
+
195
+ rule = config[:elem_rules].has_key?(name) ? config[:elem_rules][name] : config[:default_elem_rule]
196
+
197
+ if !rule
52
198
  remove
199
+ elsif rule == :strip
200
+ strip
53
201
  else
54
- parent.replace_child self, Hpricot.make(inner_html) unless parent.nil?
202
+ # Positive rule
203
+ # Keep the element
204
+ # On to attributes
205
+ scrub_attributes(rule == true ? config[:default_attribute_rule] : rule)
55
206
  end
207
+
208
+ return self
56
209
  end
57
-
58
- def strip_attributes(safe=[])
59
- attributes.each {|atr|
60
- remove_attribute(atr[0]) unless safe.include?(atr[0])
61
- } unless attributes.nil?
210
+
211
+ #
212
+ # Loops over all the attributes on this element, and removes any which Hpricot::Scrub.keep_attribute? returns false for
213
+ #
214
+ def scrub_attributes(attribute_rule = nil)
215
+ if attributes
216
+ attributes.each do |key, value|
217
+ remove_attribute(key) unless Scrub.keep_attribute?(self, key, value, attribute_rule)
218
+ end
219
+ end
220
+ return true
62
221
  end
63
222
 
64
- def strip_removes?
65
- # TODO: find other elements that should be removed instead of stripped
66
- attributes && attributes['type'] =~ /script|css/
67
- end
68
- end
223
+ end #class Elem
69
224
 
70
225
  class Doc
71
- def scrub(config={})
72
- config = {
73
- :remove_tags => [],
74
- :allow_tags => [],
75
- :allow_attributes => []
76
- }.merge(config)
77
-
78
- config[:remove_tags].each { |tag| (self/tag).remove }
79
- config[:allow_tags].each { |tag|
80
- (self/tag).strip_attributes(config[:allow_attributes])
81
- }
82
- children.reverse.each {|c| c.scrub(config) if c.scrubable? }
83
- self
226
+
227
+ #
228
+ # Scrubs the Hpricot document by removing certain elements and attributes
229
+ # according to the passed-in config
230
+ # WARNING: This is destructive. If you want to keep your document untouched use a duplicate copy
231
+ # See the documentation on Hpricot::Elem#scrub for documentation of config
232
+ #
233
+ def scrub(config=nil)
234
+ config = Scrub::normalize_config(config)
235
+ children.reverse.each do |child|
236
+ child.scrub(config) if child.scrubbable?
237
+ end
238
+ return self
84
239
  end
85
- end
240
+
241
+ end #class Doc
242
+
86
243
  end
87
244
 
88
245
  class String
89
- def scrub!
90
- self.gsub!(/^(\n|.)*$/, Hpricot(self).scrub.inner_html)
246
+ def scrub!(config=nil)
247
+ self.gsub!(/^(\n|.)*$/, Hpricot(self).scrub(config).inner_html)
91
248
  end
92
249
 
93
- def scrub
94
- dup.scrub!
250
+ def scrub(config=nil)
251
+ dup.scrub!(config)
95
252
  end
96
253
  end
97
254
 
@@ -116,4 +273,5 @@ begin
116
273
  dup.decode!
117
274
  end
118
275
  end
276
+
119
277
  rescue LoadError; end
@@ -1,8 +1,8 @@
1
1
  module HpricotScrub #:nodoc:
2
2
  module VERSION #:nodoc:
3
3
  MAJOR = 0
4
- MINOR = 2
5
- TINY = 3
4
+ MINOR = 3
5
+ TINY = 0
6
6
 
7
7
  STRING = [MAJOR, MINOR, TINY].join('.')
8
8
  end
@@ -1,84 +1,128 @@
1
+ #
2
+ # This test case tests HpricotScrub features that were introduced in version 0.3.0
3
+ # with the introduction of more fine-grained filtering
4
+ #
1
5
  require File.dirname(__FILE__) + '/test_helper.rb'
2
6
  require File.dirname(__FILE__) + '/scrubber_data.rb'
7
+ require "uri"
3
8
 
4
9
  class HpricotScrubTest < Test::Unit::TestCase
5
10
 
6
11
  def setup
7
- @clean = Hpricot(MARKUP).scrub.inner_html
8
- @config = YAML.load_file('examples/config.yml')
9
12
 
10
- # add some tags that most users will probably want
11
- @config_full = @config.dup
12
- %w(body head html).each { |x| @config_full[:allow_tags].push(x) }
13
+ config = {
14
+ :elem_rules => {
15
+ "a" => {
16
+ "href" => %r|^https?://|i
17
+ },
18
+ "b" => true,
19
+ "body" => {
20
+ "lang" => %w(en es fr)
21
+ },
22
+ "br" => true,
23
+ "div" => %w(id class style),
24
+ "hr" => true,
25
+ "html" => true,
26
+ "img" => {
27
+ "src" => Proc.new do |parent_element, attribute_key, attribute_value|
28
+ begin
29
+ uri = URI.parse(attribute_value)
30
+ uri.is_a?(URI::HTTP) && uri.host != /imageshack/i
31
+ rescue
32
+ false
33
+ end
34
+ end,
35
+ "align" => "middle",
36
+ "alt" => true
37
+ },
38
+ "marquee" => :strip,
39
+ "p" => true,
40
+ "script" => false,
41
+ "span" => :strip,
42
+ "strong" => true,
43
+ "style" => false
44
+ },
45
+ :default_elem_rule => :strip,
46
+ :default_comment_rule => false,
47
+ :default_attribute_rule => false
48
+ }
49
+
50
+ @docs = [
51
+ Hpricot(MARKUP),
52
+ Hpricot(GOOGLE)
53
+ ]
54
+ @scrubbed_docs = [
55
+ Hpricot(MARKUP).scrub(config),
56
+ Hpricot(GOOGLE).scrub(config)
57
+ ]
58
+
13
59
  end
14
60
 
15
- def test_full_markup_partial_scrub
16
- full = Hpricot(MARKUP)
17
- full_markup = '<html><head></head><body>' + MARKUP + '</body></html>'
18
- doc = Hpricot(full_markup).scrub(@config_full)
19
- partial_scrub_common(doc, full)
61
+ def test_elem_default_rule_strips
62
+ @scrubbed_docs.each do |doc|
63
+ assert_equal 0, doc.search("//span").length
64
+ end
20
65
  end
21
66
 
22
- def test_full_scrub
23
- doc = Hpricot(MARKUP).scrub
24
- # using the divisor search throws warnings in test
25
- assert_tag_count(doc, 'a', 0)
26
- assert_tag_count(doc, 'p', 0)
27
- assert_tag_count(doc, 'img', 0)
28
- assert_tag_count(doc, 'br', 0)
29
- assert_tag_count(doc, 'div', 0)
30
- assert_tag_count(doc, 'script', 0)
67
+ def test_elem_rule_keep
68
+ @scrubbed_docs.each_with_index do |doc, i|
69
+ assert_equal @docs[i].search("//a").length, doc.search("//a").length
70
+ assert_equal @docs[i].search("//b").length, doc.search("//b").length
71
+ assert_equal @docs[i].search("//img").length, doc.search("//img").length
72
+ end
31
73
  end
32
74
 
33
- def test_partial_scrub
34
- full = Hpricot(MARKUP)
35
- doc = Hpricot(MARKUP).scrub(@config)
36
- partial_scrub_common(doc, full)
75
+ def test_elem_rule_remove
76
+ @scrubbed_docs.each do |doc|
77
+ assert_equal 0, doc.search("//script").length
78
+ assert_equal 0, doc.search("//style").length
79
+ end
37
80
  end
38
81
 
39
- def test_full_doc
40
- doc = Hpricot(GOOGLE).scrub
41
- assert_tag_count(doc, 'a', 0)
42
- assert_tag_count(doc, 'p', 0)
43
- assert_tag_count(doc, 'img', 0)
44
- assert_tag_count(doc, 'br', 0)
45
- assert_tag_count(doc, 'div', 0)
46
- assert_tag_count(doc, 'script', 0)
82
+ def test_elem_rule_strip
83
+ @scrubbed_docs.each do |doc|
84
+ assert_equal 0, doc.search("//marquee").length
85
+ assert_equal 0, doc.search("//span").length
86
+ end
47
87
  end
48
88
 
49
- def test_string_scrub
50
- formatted = MARKUP
51
- assert formatted.scrub == @clean
52
- assert formatted == MARKUP
89
+ def test_attr_default_rule_removes
90
+ @scrubbed_docs.each do |doc|
91
+ assert_equal 0, doc.search("*[@mce_src]").length
92
+ assert_equal 0, doc.search("*[@target]").length
93
+ end
53
94
  end
54
95
 
55
- def test_string_scrub!
56
- formatted = MARKUP
57
- assert formatted.scrub! == @clean
58
- assert formatted == @clean
96
+ def test_attr_rule_true
97
+ @scrubbed_docs.each_with_index do |doc, i|
98
+ assert_equal @docs[i].search("*[@alt]").length, doc.search("*[@alt]").length
99
+ end
59
100
  end
60
101
 
61
- def test_decoder
62
- str = 'some <a href="http://example.com/">example&nbsp;link</a> to nowhere'
63
- scrubbed_str = str.scrub
64
- assert scrubbed_str.include?('&nbsp;')
102
+ def test_attr_rule_string
103
+ @scrubbed_docs.each_with_index do |doc, i|
104
+ assert_equal @docs[i].search("img[@align='middle']").length, doc.search("img[@align]").length
105
+ end
106
+ end
65
107
 
66
- if defined?(HTMLEntities)
67
- assert ! scrubbed_str.decode.include?('&nbsp;')
108
+ def test_attr_rule_regexp
109
+ @scrubbed_docs.each_with_index do |doc, i|
110
+ assert_equal @docs[i].search("a[@href^='http']").length, doc.search("a[@href]").length
111
+ end
112
+ end
68
113
 
69
- scrubbed_str.decode!
70
- assert ! scrubbed_str.include?('&nbsp;')
71
- end
114
+ def test_attr_rule_array
115
+ @scrubbed_docs.each_with_index do |doc, i|
116
+ assert_equal @docs[i].search("div[@id]").length, doc.search("div[@id]").length
117
+ assert_equal @docs[i].search("div[@class]").length, doc.search("div[@class]").length
118
+ assert_equal @docs[i].search("div[@style]").length, doc.search("div[@style]").length
119
+ end
72
120
  end
73
121
 
74
- private
75
- def partial_scrub_common(doc, full)
76
- # using the divisor search throws warnings in test
77
- assert_tag_count(doc, 'a', 0)
78
- assert_tag_count(doc, 'p', full.search('//p').size)
79
- assert_tag_count(doc, 'div', full.search('//div').size)
80
- assert_tag_count(doc, 'img', full.search('//img').size)
81
- assert_tag_count(doc, 'br', full.search('//br').size)
82
- assert_tag_count(doc, 'script', 0)
122
+ def test_attr_rule_proc
123
+ @scrubbed_docs.each_with_index do |doc, i|
124
+ assert_equal @docs[i].search("img[@src^='http']").length, doc.search("img[@src]").length
125
+ end
83
126
  end
127
+
84
128
  end
@@ -0,0 +1,88 @@
1
+ #
2
+ # This test case tests HpricotScrub features that were present in version 0.2.3 before
3
+ # the introduction of more fine-grained filtering
4
+ #
5
+ require File.dirname(__FILE__) + '/test_helper.rb'
6
+ require File.dirname(__FILE__) + '/scrubber_data.rb'
7
+
8
+ class OldHpricotScrubTest < Test::Unit::TestCase
9
+
10
+ def setup
11
+ @clean = Hpricot(MARKUP).scrub.inner_html
12
+ @config = YAML.load_file('examples/old_config.yml')
13
+
14
+ # add some tags that most users will probably want
15
+ @config_full = @config.dup
16
+ %w(body head html).each { |x| @config_full[:allow_tags].push(x) }
17
+ end
18
+
19
+ def test_full_markup_partial_scrub
20
+ full = Hpricot(MARKUP)
21
+ full_markup = '<html><head></head><body>' + MARKUP + '</body></html>'
22
+ doc = Hpricot(full_markup).scrub(@config_full)
23
+ partial_scrub_common(doc, full)
24
+ end
25
+
26
+ def test_full_scrub
27
+ doc = Hpricot(MARKUP).scrub
28
+ # using the divisor search throws warnings in test
29
+ assert_tag_count(doc, 'a', 0)
30
+ assert_tag_count(doc, 'p', 0)
31
+ assert_tag_count(doc, 'img', 0)
32
+ assert_tag_count(doc, 'br', 0)
33
+ assert_tag_count(doc, 'div', 0)
34
+ assert_tag_count(doc, 'script', 0)
35
+ end
36
+
37
+ def test_partial_scrub
38
+ full = Hpricot(MARKUP)
39
+ doc = Hpricot(MARKUP).scrub(@config)
40
+ partial_scrub_common(doc, full)
41
+ end
42
+
43
+ def test_full_doc
44
+ doc = Hpricot(GOOGLE).scrub
45
+ assert_tag_count(doc, 'a', 0)
46
+ assert_tag_count(doc, 'p', 0)
47
+ assert_tag_count(doc, 'img', 0)
48
+ assert_tag_count(doc, 'br', 0)
49
+ assert_tag_count(doc, 'div', 0)
50
+ assert_tag_count(doc, 'script', 0)
51
+ end
52
+
53
+ def test_string_scrub
54
+ formatted = MARKUP
55
+ assert formatted.scrub == @clean
56
+ assert formatted == MARKUP
57
+ end
58
+
59
+ def test_string_scrub!
60
+ formatted = MARKUP
61
+ assert formatted.scrub! == @clean
62
+ assert formatted == @clean
63
+ end
64
+
65
+ def test_decoder
66
+ str = 'some <a href="http://example.com/">example&nbsp;link</a> to nowhere'
67
+ scrubbed_str = str.scrub
68
+ assert scrubbed_str.include?('&nbsp;')
69
+
70
+ if defined?(HTMLEntities)
71
+ assert ! scrubbed_str.decode.include?('&nbsp;')
72
+
73
+ scrubbed_str.decode!
74
+ assert ! scrubbed_str.include?('&nbsp;')
75
+ end
76
+ end
77
+
78
+ private
79
+ def partial_scrub_common(doc, full)
80
+ # using the divisor search throws warnings in test
81
+ assert_tag_count(doc, 'a', 0)
82
+ assert_tag_count(doc, 'p', full.search('//p').size)
83
+ assert_tag_count(doc, 'div', full.search('//div').size)
84
+ assert_tag_count(doc, 'img', full.search('//img').size)
85
+ assert_tag_count(doc, 'br', full.search('//br').size)
86
+ assert_tag_count(doc, 'script', 0)
87
+ end
88
+ end
metadata CHANGED
@@ -1,10 +1,10 @@
1
1
  --- !ruby/object:Gem::Specification
2
- rubygems_version: 0.9.2.1
2
+ rubygems_version: 0.9.4
3
3
  specification_version: 1
4
4
  name: hpricot_scrub
5
5
  version: !ruby/object:Gem::Version
6
- version: 0.2.3
7
- date: 2007-04-05 00:00:00 -07:00
6
+ version: 0.3.0
7
+ date: 2008-04-22 00:00:00 -05:00
8
8
  summary: Scrub HTML with Hpricot
9
9
  require_paths:
10
10
  - lib
@@ -40,13 +40,18 @@ files:
40
40
  - test/test_helper.rb
41
41
  - test/scrubber_data.rb
42
42
  - test/hpricot_scrub_test.rb
43
- - examples/config.yml
43
+ - test/old_hpricot_scrub_test.rb
44
+ - examples/old_config.yml
44
45
  test_files:
45
46
  - test/hpricot_scrub_test.rb
46
- rdoc_options: []
47
-
48
- extra_rdoc_files: []
49
-
47
+ - test/old_hpricot_scrub_test.rb
48
+ rdoc_options:
49
+ - --main
50
+ - README.txt
51
+ extra_rdoc_files:
52
+ - README.txt
53
+ - CHANGELOG.txt
54
+ - Manifest.txt
50
55
  executables: []
51
56
 
52
57
  extensions: []
File without changes