hpricot_scrub 0.3.0 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,3 +1,11 @@
1
+ 2008-06-03 Mina Naguib <mina.hpricotscrub@naguib.ca>
2
+ Release 0.3.1
3
+ - Allow an element rule to be a Proc (which is expected to return one of
4
+ the other non-proc rules (false/:strip/true/attr_rules)). This allows the
5
+ building of highly custom filtering rules (for example filter out <B>
6
+ tags unless their parent is a <P>...)
7
+ - Slight cleanup of internal documentation
8
+
1
9
  2008-01-11 Mina Naguib <mina.hpricotscrub@naguib.ca>
2
10
  Release 0.3.0
3
11
  Large overhaul of the module's logic to mimic most of perl's HTML::Scrubber
@@ -38,15 +38,15 @@ module Hpricot
38
38
  #
39
39
  # Merge+delete legacy config keys
40
40
  #
41
- # :remove_tags
41
+ # :remove_tags -> :elem_rules (false)
42
42
  (config.delete(:remove_tags) || []).each do |tag|
43
43
  config[:elem_rules][tag] = false unless config[:elem_rules].has_key?(tag)
44
44
  end
45
- # :allow_tags
45
+ # :allow_tags -> :elem_rules (true)
46
46
  (config.delete(:allow_tags) || []).each do |tag|
47
47
  config[:elem_rules][tag] = true unless config[:elem_rules].has_key?(tag)
48
48
  end
49
- # :allow_attributes
49
+ # :allow_attributes -> :default_attribute_rule (procs)
50
50
  (config.delete(:allow_attributes) || []).each do |attribute|
51
51
  #
52
52
  # Add it to the default attribute rule
@@ -174,13 +174,14 @@ module Hpricot
174
174
  # Scrubs the element according to the given config
175
175
  # The relevant config key is :elem_rules. It is expected to be a Hash having String HTML tag names as keys, and a rule as values
176
176
  # The rule value dictates what happens to the element. The following logic is used:
177
- # If the rules is false, the element is removed
177
+ # If the rule is false/nil, the element is removed along with all its children
178
178
  # If the rule is :strip, the element is stripped (the element itself is deleted and its children are promoted upwards to where it was)
179
+ # If the rule is a proc, the proc is called (and given the element itself) - the proc's expected to return a valid rule that matches this documentation
179
180
  # Otherwise the element is kept
180
181
  #
181
182
  # If the element name (HTML tag) was not found in :elem_rules, the default rule in config key :default_elem_rule is used
182
183
  #
183
- # After the above is done, scrub_attributes is called if the element was kept. The rule is passed to it as it's assumed to be the attribute rules (see Hpricot::Scrub.keep_attribute?) to apply to the attributes, UNLESS the rule was explicitly "true", in which case the config key :default_attribute_rule is passed.
184
+ # After the above is done, if the element was kept, it's time to clean up its attributes so scrub_attributes is called. The rule is passed to it as it's assumed to be the attribute rules (see Hpricot::Scrub.keep_attribute?) to apply to the attributes, UNLESS the rule was explicitly "true", in which case the config key :default_attribute_rule is passed.
184
185
  #
185
186
  # This is recursive and will do all the above to all the children of the element as well.
186
187
  #
@@ -194,6 +195,10 @@ module Hpricot
194
195
 
195
196
  rule = config[:elem_rules].has_key?(name) ? config[:elem_rules][name] : config[:default_elem_rule]
196
197
 
198
+ while rule.is_a?(Proc)
199
+ rule = rule.call(self)
200
+ end
201
+
197
202
  if !rule
198
203
  remove
199
204
  elsif rule == :strip
@@ -2,7 +2,7 @@ module HpricotScrub #:nodoc:
2
2
  module VERSION #:nodoc:
3
3
  MAJOR = 0
4
4
  MINOR = 3
5
- TINY = 0
5
+ TINY = 1
6
6
 
7
7
  STRING = [MAJOR, MINOR, TINY].join('.')
8
8
  end
@@ -15,7 +15,9 @@ class HpricotScrubTest < Test::Unit::TestCase
15
15
  "a" => {
16
16
  "href" => %r|^https?://|i
17
17
  },
18
- "b" => true,
18
+ "b" => Proc.new do |bold_element|
19
+ bold_element.parent.name == "p" ? true : :strip
20
+ end,
19
21
  "body" => {
20
22
  "lang" => %w(en es fr)
21
23
  },
@@ -67,7 +69,6 @@ class HpricotScrubTest < Test::Unit::TestCase
67
69
  def test_elem_rule_keep
68
70
  @scrubbed_docs.each_with_index do |doc, i|
69
71
  assert_equal @docs[i].search("//a").length, doc.search("//a").length
70
- assert_equal @docs[i].search("//b").length, doc.search("//b").length
71
72
  assert_equal @docs[i].search("//img").length, doc.search("//img").length
72
73
  end
73
74
  end
@@ -86,6 +87,12 @@ class HpricotScrubTest < Test::Unit::TestCase
86
87
  end
87
88
  end
88
89
 
90
+ def test_elem_rule_proc
91
+ @scrubbed_docs.each_with_index do |doc, i|
92
+ assert_equal @docs[i].search("//p/b").length, doc.search("//b").length
93
+ end
94
+ end
95
+
89
96
  def test_attr_default_rule_removes
90
97
  @scrubbed_docs.each do |doc|
91
98
  assert_equal 0, doc.search("*[@mce_src]").length
metadata CHANGED
@@ -3,8 +3,8 @@ rubygems_version: 0.9.4
3
3
  specification_version: 1
4
4
  name: hpricot_scrub
5
5
  version: !ruby/object:Gem::Version
6
- version: 0.3.0
7
- date: 2008-04-22 00:00:00 -05:00
6
+ version: 0.3.1
7
+ date: 2008-06-04 00:00:00 -05:00
8
8
  summary: Scrub HTML with Hpricot
9
9
  require_paths:
10
10
  - lib
@@ -64,7 +64,16 @@ dependencies:
64
64
  version_requirement:
65
65
  version_requirements: !ruby/object:Gem::Version::Requirement
66
66
  requirements:
67
- - - ">="
67
+ - - ">"
68
68
  - !ruby/object:Gem::Version
69
- version: "0.5"
69
+ version: 0.0.0
70
+ version:
71
+ - !ruby/object:Gem::Dependency
72
+ name: ">= 0.5"
73
+ version_requirement:
74
+ version_requirements: !ruby/object:Gem::Version::Requirement
75
+ requirements:
76
+ - - ">"
77
+ - !ruby/object:Gem::Version
78
+ version: 0.0.0
70
79
  version: