hpricot_scrub 0.2.3 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/CHANGELOG.txt CHANGED
@@ -1,3 +1,12 @@
1
+ 2008-01-11 Mina Naguib <mina.hpricotscrub@naguib.ca>
2
+ Release 0.3.0
3
+ Large overhaul of the module's logic to mimic most of perl's HTML::Scrubber
4
+ functionality:
5
+ - Deprecate config keys :allow_tags, :remove_tags and :allow_attributes
6
+ - Introduce config keys :elem_rules, :default_elem_rule,
7
+ :default_comment_rule and :default_attribute_rule
8
+ - Document the above (inline - visible in rdoc & the likes)
9
+
1
10
  2007-04-05 Michael <michael@underpantsgnome.com>
2
11
  Release 0.2.3
3
12
  Add patches from Eric Wong
data/Manifest.txt CHANGED
@@ -9,4 +9,5 @@ lib/hpricot_scrub.rb
9
9
  test/test_helper.rb
10
10
  test/scrubber_data.rb
11
11
  test/hpricot_scrub_test.rb
12
- examples/config.yml
12
+ test/old_hpricot_scrub_test.rb
13
+ examples/old_config.yml
@@ -9,14 +9,120 @@ end
9
9
  require 'hpricot'
10
10
 
11
11
  module Hpricot
12
- module Scrubable
13
- # TODO: figure out how to handle comments
14
- def scrubable?
15
- ! [ Hpricot::Text,
16
- Hpricot::BogusETag,
17
- Hpricot::Comment
18
- ].include?(self.class) && self.respond_to?(:scrub)
12
+
13
+ class Scrub
14
+
15
+ def self.normalize_config(config) #:nodoc:#
16
+ config = {} unless config.is_a?(Hash)
17
+
18
+ return config if config[:normalized]
19
+
20
+ config = {
21
+
22
+ # Legacy config keys:
23
+ :remove_tags => [],
24
+ :allow_tags => [],
25
+ :allow_attributes => [],
26
+
27
+ # New fine-grained hotness:
28
+ :elem_rules => {
29
+ "script" => false,
30
+ "style" => false
31
+ },
32
+ :default_elem_rule => :strip,
33
+ :default_comment_rule => false,
34
+ :default_attribute_rule => false
35
+
36
+ }.merge(config)
37
+
38
+ #
39
+ # Merge+delete legacy config keys
40
+ #
41
+ # :remove_tags
42
+ (config.delete(:remove_tags) || []).each do |tag|
43
+ config[:elem_rules][tag] = false unless config[:elem_rules].has_key?(tag)
44
+ end
45
+ # :allow_tags
46
+ (config.delete(:allow_tags) || []).each do |tag|
47
+ config[:elem_rules][tag] = true unless config[:elem_rules].has_key?(tag)
48
+ end
49
+ # :allow_attributes
50
+ (config.delete(:allow_attributes) || []).each do |attribute|
51
+ #
52
+ # Add it to the default attribute rule
53
+ #
54
+ old_rule = config[:default_attribute_rule]
55
+ config[:default_attribute_rule] = Proc.new do |parent_element, key, value|
56
+ if key == attribute
57
+ true
58
+ else
59
+ Scrub::keep_attribute?(parent_element, key, value, old_rule)
60
+ end
61
+ end
62
+ end
63
+
64
+ config[:normalized] = true
65
+ return config
66
+ end
67
+
68
+ #
69
+ # Takes:
70
+ #
71
+ # An element
72
+ # An attribute key found in that element
73
+ # The attribute value attached to the key
74
+ # An attribute rule
75
+ #
76
+ # Checks the rule against the attribute and returns:
77
+ #
78
+ # true = the attribute should be kept
79
+ # false = the attribute should NOT be kept
80
+ #
81
+ # Acceptable attribute rules are:
82
+ #
83
+ # true: Keep the attribute without inspection
84
+ # a String: Attribute value must be the same as the string
85
+ # an Array: Attribute key must exist in the array
86
+ # a Regexp: Attribute value must match the regexp
87
+ # a Hash: The attribute key is found in the hash, and the value is considered a new rule and follows these same rules via recursion
88
+ # a Proc: The Proc is called with arguments (parent_element, key, value), the returned value is considered a new rule and follows these same rules via recursion
89
+ # otherwise: Remove the attribute
90
+ #
91
+ def self.keep_attribute?(parent_element, key, value, attribute_rule)
92
+
93
+ if attribute_rule == true
94
+ keep = true
95
+ elsif attribute_rule.is_a?(String)
96
+ keep = (attribute_rule == value)
97
+ elsif attribute_rule.is_a?(Array)
98
+ keep = attribute_rule.include?(key)
99
+ elsif attribute_rule.is_a?(Regexp)
100
+ keep = attribute_rule.match(value)
101
+ elsif attribute_rule.is_a?(Hash)
102
+ # Allow hash value to be new rule via recursion
103
+ new_rule = attribute_rule[key]
104
+ keep = keep_attribute?(parent_element, key, value, new_rule)
105
+ elsif attribute_rule.is_a?(Proc)
106
+ # Allow the proc to return a new rule - recurse:
107
+ new_rule = attribute_rule.call(parent_element, key, value)
108
+ keep = keep_attribute?(parent_element, key, value, new_rule)
109
+ else
110
+ # Err on the side of caution
111
+ keep = false
112
+ end
113
+
114
+ return keep
115
+
19
116
  end
117
+
118
+ module Scrubbable
119
+ def scrubbable?
120
+ ! [ Hpricot::Text,
121
+ Hpricot::BogusETag,
122
+ ].include?(self.class) && self.respond_to?(:scrub)
123
+ end
124
+ end
125
+
20
126
  end
21
127
 
22
128
  class Elements
@@ -25,73 +131,124 @@ module Hpricot
25
131
  end
26
132
 
27
133
  def strip_attributes(safe=[])
28
- each { |x| x.strip_attributes(safe) }
134
+ each { |x| x.scrub_attributes(safe) }
29
135
  end
30
136
  end
31
137
 
32
138
  class BaseEle
33
- include Scrubable
139
+ include Scrub::Scrubbable
34
140
  end
35
141
 
36
- class Elem
37
- include Scrubable
142
+ class Comment
143
+ include Scrub::Scrubbable
38
144
 
39
- def scrub(config)
40
- children.reverse.each { |c| c.scrub(config) if c.scrubable? }
41
- strip unless config[:allow_tags].include?(name)
145
+ def remove
146
+ parent.children.delete(self)
42
147
  end
43
148
 
149
+ #
150
+ # Scrubs this comment according to the given config
151
+ # If the config key :default_comment_rule is true, the comment is kept. Otherwise it's removed.
152
+ #
153
+ def scrub(config = nil)
154
+ config = Scrub::normalize_config(config)
155
+ rule = config[:default_comment_rule]
156
+ remove unless rule
157
+ return true
158
+ end
159
+
160
+ end
161
+
162
+ class Elem
163
+ include Scrub::Scrubbable
164
+
44
165
  def remove
45
166
  parent.children.delete(self)
46
167
  end
47
168
 
48
169
  def strip
49
- children.each { |c| c.strip if c.scrubable? }
170
+ swap(inner_html)
171
+ end
172
+
173
+ #
174
+ # Scrubs the element according to the given config
175
+ # The relevant config key is :elem_rules. It is expected to be a Hash having String HTML tag names as keys, and a rule as values
176
+ # The rule value dictates what happens to the element. The following logic is used:
177
+ # If the rule is false, the element is removed
178
+ # If the rule is :strip, the element is stripped (the element itself is deleted and its children are promoted upwards to where it was)
179
+ # Otherwise the element is kept
180
+ #
181
+ # If the element name (HTML tag) was not found in :elem_rules, the default rule in config key :default_elem_rule is used
182
+ #
183
+ # After the above is done, scrub_attributes is called if the element was kept. The rule is passed to it as it's assumed to be the attribute rules (see Hpricot::Scrub.keep_attribute?) to apply to the attributes, UNLESS the rule was explicitly "true", in which case the config key :default_attribute_rule is passed.
184
+ #
185
+ # This is recursive and will do all the above to all the children of the element as well.
186
+ #
187
+ def scrub(config = nil)
50
188
 
51
- if strip_removes?
189
+ config = Scrub::normalize_config(config)
190
+
191
+ children.reverse.each do |child|
192
+ child.scrub(config) if child.scrubbable?
193
+ end
194
+
195
+ rule = config[:elem_rules].has_key?(name) ? config[:elem_rules][name] : config[:default_elem_rule]
196
+
197
+ if !rule
52
198
  remove
199
+ elsif rule == :strip
200
+ strip
53
201
  else
54
- parent.replace_child self, Hpricot.make(inner_html) unless parent.nil?
202
+ # Positive rule
203
+ # Keep the element
204
+ # On to attributes
205
+ scrub_attributes(rule == true ? config[:default_attribute_rule] : rule)
55
206
  end
207
+
208
+ return self
56
209
  end
57
-
58
- def strip_attributes(safe=[])
59
- attributes.each {|atr|
60
- remove_attribute(atr[0]) unless safe.include?(atr[0])
61
- } unless attributes.nil?
210
+
211
+ #
212
+ # Loops over all the attributes on this element, and removes any which Hpricot::Scrub.keep_attribute? returns false for
213
+ #
214
+ def scrub_attributes(attribute_rule = nil)
215
+ if attributes
216
+ attributes.each do |key, value|
217
+ remove_attribute(key) unless Scrub.keep_attribute?(self, key, value, attribute_rule)
218
+ end
219
+ end
220
+ return true
62
221
  end
63
222
 
64
- def strip_removes?
65
- # TODO: find other elements that should be removed instead of stripped
66
- attributes && attributes['type'] =~ /script|css/
67
- end
68
- end
223
+ end #class Elem
69
224
 
70
225
  class Doc
71
- def scrub(config={})
72
- config = {
73
- :remove_tags => [],
74
- :allow_tags => [],
75
- :allow_attributes => []
76
- }.merge(config)
77
-
78
- config[:remove_tags].each { |tag| (self/tag).remove }
79
- config[:allow_tags].each { |tag|
80
- (self/tag).strip_attributes(config[:allow_attributes])
81
- }
82
- children.reverse.each {|c| c.scrub(config) if c.scrubable? }
83
- self
226
+
227
+ #
228
+ # Scrubs the Hpricot document by removing certain elements and attributes
229
+ # according to the passed-in config
230
+ # WARNING: This is destructive. If you want to keep your document untouched use a duplicate copy
231
+ # See the documentation on Hpricot::Elem#scrub for documentation of config
232
+ #
233
+ def scrub(config=nil)
234
+ config = Scrub::normalize_config(config)
235
+ children.reverse.each do |child|
236
+ child.scrub(config) if child.scrubbable?
237
+ end
238
+ return self
84
239
  end
85
- end
240
+
241
+ end #class Doc
242
+
86
243
  end
87
244
 
88
245
  class String
89
- def scrub!
90
- self.gsub!(/^(\n|.)*$/, Hpricot(self).scrub.inner_html)
246
+ def scrub!(config=nil)
247
+ self.gsub!(/^(\n|.)*$/, Hpricot(self).scrub(config).inner_html)
91
248
  end
92
249
 
93
- def scrub
94
- dup.scrub!
250
+ def scrub(config=nil)
251
+ dup.scrub!(config)
95
252
  end
96
253
  end
97
254
 
@@ -116,4 +273,5 @@ begin
116
273
  dup.decode!
117
274
  end
118
275
  end
276
+
119
277
  rescue LoadError; end
@@ -1,8 +1,8 @@
1
1
  module HpricotScrub #:nodoc:
2
2
  module VERSION #:nodoc:
3
3
  MAJOR = 0
4
- MINOR = 2
5
- TINY = 3
4
+ MINOR = 3
5
+ TINY = 0
6
6
 
7
7
  STRING = [MAJOR, MINOR, TINY].join('.')
8
8
  end
@@ -1,84 +1,128 @@
1
+ #
2
+ # This test case tests HpricotScrub features that were introduced in version 0.3.0
3
+ # with the introduction of more fine-grained filtering
4
+ #
1
5
  require File.dirname(__FILE__) + '/test_helper.rb'
2
6
  require File.dirname(__FILE__) + '/scrubber_data.rb'
7
+ require "uri"
3
8
 
4
9
  class HpricotScrubTest < Test::Unit::TestCase
5
10
 
6
11
  def setup
7
- @clean = Hpricot(MARKUP).scrub.inner_html
8
- @config = YAML.load_file('examples/config.yml')
9
12
 
10
- # add some tags that most users will probably want
11
- @config_full = @config.dup
12
- %w(body head html).each { |x| @config_full[:allow_tags].push(x) }
13
+ config = {
14
+ :elem_rules => {
15
+ "a" => {
16
+ "href" => %r|^https?://|i
17
+ },
18
+ "b" => true,
19
+ "body" => {
20
+ "lang" => %w(en es fr)
21
+ },
22
+ "br" => true,
23
+ "div" => %w(id class style),
24
+ "hr" => true,
25
+ "html" => true,
26
+ "img" => {
27
+ "src" => Proc.new do |parent_element, attribute_key, attribute_value|
28
+ begin
29
+ uri = URI.parse(attribute_value)
30
+ uri.is_a?(URI::HTTP) && uri.host != /imageshack/i
31
+ rescue
32
+ false
33
+ end
34
+ end,
35
+ "align" => "middle",
36
+ "alt" => true
37
+ },
38
+ "marquee" => :strip,
39
+ "p" => true,
40
+ "script" => false,
41
+ "span" => :strip,
42
+ "strong" => true,
43
+ "style" => false
44
+ },
45
+ :default_elem_rule => :strip,
46
+ :default_comment_rule => false,
47
+ :default_attribute_rule => false
48
+ }
49
+
50
+ @docs = [
51
+ Hpricot(MARKUP),
52
+ Hpricot(GOOGLE)
53
+ ]
54
+ @scrubbed_docs = [
55
+ Hpricot(MARKUP).scrub(config),
56
+ Hpricot(GOOGLE).scrub(config)
57
+ ]
58
+
13
59
  end
14
60
 
15
- def test_full_markup_partial_scrub
16
- full = Hpricot(MARKUP)
17
- full_markup = '<html><head></head><body>' + MARKUP + '</body></html>'
18
- doc = Hpricot(full_markup).scrub(@config_full)
19
- partial_scrub_common(doc, full)
61
+ def test_elem_default_rule_strips
62
+ @scrubbed_docs.each do |doc|
63
+ assert_equal 0, doc.search("//span").length
64
+ end
20
65
  end
21
66
 
22
- def test_full_scrub
23
- doc = Hpricot(MARKUP).scrub
24
- # using the divisor search throws warnings in test
25
- assert_tag_count(doc, 'a', 0)
26
- assert_tag_count(doc, 'p', 0)
27
- assert_tag_count(doc, 'img', 0)
28
- assert_tag_count(doc, 'br', 0)
29
- assert_tag_count(doc, 'div', 0)
30
- assert_tag_count(doc, 'script', 0)
67
+ def test_elem_rule_keep
68
+ @scrubbed_docs.each_with_index do |doc, i|
69
+ assert_equal @docs[i].search("//a").length, doc.search("//a").length
70
+ assert_equal @docs[i].search("//b").length, doc.search("//b").length
71
+ assert_equal @docs[i].search("//img").length, doc.search("//img").length
72
+ end
31
73
  end
32
74
 
33
- def test_partial_scrub
34
- full = Hpricot(MARKUP)
35
- doc = Hpricot(MARKUP).scrub(@config)
36
- partial_scrub_common(doc, full)
75
+ def test_elem_rule_remove
76
+ @scrubbed_docs.each do |doc|
77
+ assert_equal 0, doc.search("//script").length
78
+ assert_equal 0, doc.search("//style").length
79
+ end
37
80
  end
38
81
 
39
- def test_full_doc
40
- doc = Hpricot(GOOGLE).scrub
41
- assert_tag_count(doc, 'a', 0)
42
- assert_tag_count(doc, 'p', 0)
43
- assert_tag_count(doc, 'img', 0)
44
- assert_tag_count(doc, 'br', 0)
45
- assert_tag_count(doc, 'div', 0)
46
- assert_tag_count(doc, 'script', 0)
82
+ def test_elem_rule_strip
83
+ @scrubbed_docs.each do |doc|
84
+ assert_equal 0, doc.search("//marquee").length
85
+ assert_equal 0, doc.search("//span").length
86
+ end
47
87
  end
48
88
 
49
- def test_string_scrub
50
- formatted = MARKUP
51
- assert formatted.scrub == @clean
52
- assert formatted == MARKUP
89
+ def test_attr_default_rule_removes
90
+ @scrubbed_docs.each do |doc|
91
+ assert_equal 0, doc.search("*[@mce_src]").length
92
+ assert_equal 0, doc.search("*[@target]").length
93
+ end
53
94
  end
54
95
 
55
- def test_string_scrub!
56
- formatted = MARKUP
57
- assert formatted.scrub! == @clean
58
- assert formatted == @clean
96
+ def test_attr_rule_true
97
+ @scrubbed_docs.each_with_index do |doc, i|
98
+ assert_equal @docs[i].search("*[@alt]").length, doc.search("*[@alt]").length
99
+ end
59
100
  end
60
101
 
61
- def test_decoder
62
- str = 'some <a href="http://example.com/">example&nbsp;link</a> to nowhere'
63
- scrubbed_str = str.scrub
64
- assert scrubbed_str.include?('&nbsp;')
102
+ def test_attr_rule_string
103
+ @scrubbed_docs.each_with_index do |doc, i|
104
+ assert_equal @docs[i].search("img[@align='middle']").length, doc.search("img[@align]").length
105
+ end
106
+ end
65
107
 
66
- if defined?(HTMLEntities)
67
- assert ! scrubbed_str.decode.include?('&nbsp;')
108
+ def test_attr_rule_regexp
109
+ @scrubbed_docs.each_with_index do |doc, i|
110
+ assert_equal @docs[i].search("a[@href^='http']").length, doc.search("a[@href]").length
111
+ end
112
+ end
68
113
 
69
- scrubbed_str.decode!
70
- assert ! scrubbed_str.include?('&nbsp;')
71
- end
114
+ def test_attr_rule_array
115
+ @scrubbed_docs.each_with_index do |doc, i|
116
+ assert_equal @docs[i].search("div[@id]").length, doc.search("div[@id]").length
117
+ assert_equal @docs[i].search("div[@class]").length, doc.search("div[@class]").length
118
+ assert_equal @docs[i].search("div[@style]").length, doc.search("div[@style]").length
119
+ end
72
120
  end
73
121
 
74
- private
75
- def partial_scrub_common(doc, full)
76
- # using the divisor search throws warnings in test
77
- assert_tag_count(doc, 'a', 0)
78
- assert_tag_count(doc, 'p', full.search('//p').size)
79
- assert_tag_count(doc, 'div', full.search('//div').size)
80
- assert_tag_count(doc, 'img', full.search('//img').size)
81
- assert_tag_count(doc, 'br', full.search('//br').size)
82
- assert_tag_count(doc, 'script', 0)
122
+ def test_attr_rule_proc
123
+ @scrubbed_docs.each_with_index do |doc, i|
124
+ assert_equal @docs[i].search("img[@src^='http']").length, doc.search("img[@src]").length
125
+ end
83
126
  end
127
+
84
128
  end
@@ -0,0 +1,88 @@
1
+ #
2
+ # This test case tests HpricotScrub features that were present in version 0.2.3 before
3
+ # introduction of more fine-grained filtering
4
+ #
5
+ require File.dirname(__FILE__) + '/test_helper.rb'
6
+ require File.dirname(__FILE__) + '/scrubber_data.rb'
7
+
8
+ class OldHpricotScrubTest < Test::Unit::TestCase
9
+
10
+ def setup
11
+ @clean = Hpricot(MARKUP).scrub.inner_html
12
+ @config = YAML.load_file('examples/old_config.yml')
13
+
14
+ # add some tags that most users will probably want
15
+ @config_full = @config.dup
16
+ %w(body head html).each { |x| @config_full[:allow_tags].push(x) }
17
+ end
18
+
19
+ def test_full_markup_partial_scrub
20
+ full = Hpricot(MARKUP)
21
+ full_markup = '<html><head></head><body>' + MARKUP + '</body></html>'
22
+ doc = Hpricot(full_markup).scrub(@config_full)
23
+ partial_scrub_common(doc, full)
24
+ end
25
+
26
+ def test_full_scrub
27
+ doc = Hpricot(MARKUP).scrub
28
+ # using the divisor search throws warnings in test
29
+ assert_tag_count(doc, 'a', 0)
30
+ assert_tag_count(doc, 'p', 0)
31
+ assert_tag_count(doc, 'img', 0)
32
+ assert_tag_count(doc, 'br', 0)
33
+ assert_tag_count(doc, 'div', 0)
34
+ assert_tag_count(doc, 'script', 0)
35
+ end
36
+
37
+ def test_partial_scrub
38
+ full = Hpricot(MARKUP)
39
+ doc = Hpricot(MARKUP).scrub(@config)
40
+ partial_scrub_common(doc, full)
41
+ end
42
+
43
+ def test_full_doc
44
+ doc = Hpricot(GOOGLE).scrub
45
+ assert_tag_count(doc, 'a', 0)
46
+ assert_tag_count(doc, 'p', 0)
47
+ assert_tag_count(doc, 'img', 0)
48
+ assert_tag_count(doc, 'br', 0)
49
+ assert_tag_count(doc, 'div', 0)
50
+ assert_tag_count(doc, 'script', 0)
51
+ end
52
+
53
+ def test_string_scrub
54
+ formatted = MARKUP
55
+ assert formatted.scrub == @clean
56
+ assert formatted == MARKUP
57
+ end
58
+
59
+ def test_string_scrub!
60
+ formatted = MARKUP
61
+ assert formatted.scrub! == @clean
62
+ assert formatted == @clean
63
+ end
64
+
65
+ def test_decoder
66
+ str = 'some <a href="http://example.com/">example&nbsp;link</a> to nowhere'
67
+ scrubbed_str = str.scrub
68
+ assert scrubbed_str.include?('&nbsp;')
69
+
70
+ if defined?(HTMLEntities)
71
+ assert ! scrubbed_str.decode.include?('&nbsp;')
72
+
73
+ scrubbed_str.decode!
74
+ assert ! scrubbed_str.include?('&nbsp;')
75
+ end
76
+ end
77
+
78
+ private
79
+ def partial_scrub_common(doc, full)
80
+ # using the divisor search throws warnings in test
81
+ assert_tag_count(doc, 'a', 0)
82
+ assert_tag_count(doc, 'p', full.search('//p').size)
83
+ assert_tag_count(doc, 'div', full.search('//div').size)
84
+ assert_tag_count(doc, 'img', full.search('//img').size)
85
+ assert_tag_count(doc, 'br', full.search('//br').size)
86
+ assert_tag_count(doc, 'script', 0)
87
+ end
88
+ end
metadata CHANGED
@@ -1,10 +1,10 @@
1
1
  --- !ruby/object:Gem::Specification
2
- rubygems_version: 0.9.2.1
2
+ rubygems_version: 0.9.4
3
3
  specification_version: 1
4
4
  name: hpricot_scrub
5
5
  version: !ruby/object:Gem::Version
6
- version: 0.2.3
7
- date: 2007-04-05 00:00:00 -07:00
6
+ version: 0.3.0
7
+ date: 2008-04-22 00:00:00 -05:00
8
8
  summary: Scrub HTML with Hpricot
9
9
  require_paths:
10
10
  - lib
@@ -40,13 +40,18 @@ files:
40
40
  - test/test_helper.rb
41
41
  - test/scrubber_data.rb
42
42
  - test/hpricot_scrub_test.rb
43
- - examples/config.yml
43
+ - test/old_hpricot_scrub_test.rb
44
+ - examples/old_config.yml
44
45
  test_files:
45
46
  - test/hpricot_scrub_test.rb
46
- rdoc_options: []
47
-
48
- extra_rdoc_files: []
49
-
47
+ - test/old_hpricot_scrub_test.rb
48
+ rdoc_options:
49
+ - --main
50
+ - README.txt
51
+ extra_rdoc_files:
52
+ - README.txt
53
+ - CHANGELOG.txt
54
+ - Manifest.txt
50
55
  executables: []
51
56
 
52
57
  extensions: []
File without changes