hpricot_scrub 0.3.0 → 0.3.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,3 +1,11 @@
1
+ 2008-06-03 Mina Naguib <mina.hpricotscrub@naguib.ca>
2
+ Release 0.3.1
3
+ - Allow an element rule to be a Proc (which is expected to return one of
4
+ the other non-proc rules (false/:strip/true/attr_rules)). This allows the
5
+ building of highly custom filtering rules (for example filter out <B>
6
+ tags unless their parent is a <P>...)
7
+ - Slight cleanup of internal documentation
8
+
1
9
  2008-01-11 Mina Naguib <mina.hpricotscrub@naguib.ca>
2
10
  Release 0.3.0
3
11
  Large overhaul of the module's logic to mimic most of perl's HTML::Scrubber
@@ -38,15 +38,15 @@ module Hpricot
38
38
  #
39
39
  # Merge+delete legacy config keys
40
40
  #
41
- # :remove_tags
41
+ # :remove_tags -> :elem_rules (false)
42
42
  (config.delete(:remove_tags) || []).each do |tag|
43
43
  config[:elem_rules][tag] = false unless config[:elem_rules].has_key?(tag)
44
44
  end
45
- # :allow_tags
45
+ # :allow_tags -> :elem_rules (true)
46
46
  (config.delete(:allow_tags) || []).each do |tag|
47
47
  config[:elem_rules][tag] = true unless config[:elem_rules].has_key?(tag)
48
48
  end
49
- # :allow_attributes
49
+ # :allow_attributes -> :default_attribute_rule (procs)
50
50
  (config.delete(:allow_attributes) || []).each do |attribute|
51
51
  #
52
52
  # Add it to the default attribute rule
@@ -174,13 +174,14 @@ module Hpricot
174
174
  # Scrubs the element according to the given config
175
175
  # The relevant config key is :elem_rules. It is expected to be a Hash having String HTML tag names as keys, and a rule as values
176
176
  # The rule value dictates what happens to the element. The following logic is used:
177
- # If the rules is false, the element is removed
177
+ # If the rule is false/nil, the element is removed along with all its children
178
178
  # If the rule is :strip, the element is stripped (the element itself is deleted and its children are promoted upwards to where it was)
179
+ # If the rule is a proc, the proc is called (and given the element itself) - the proc is expected to return a valid rule as described in this documentation
179
180
  # Otherwise the element is kept
180
181
  #
181
182
  # If the element name (HTML tag) was not found in :elem_rules, the default rule in config key :default_elem_rule is used
182
183
  #
183
- # After the above is done, scrub_attributes is called if the element was kept. The rule is passed to it as it's assumed to be the attribute rules (see Hpricot::Scrub.keep_attribute?) to apply to the attributes, UNLESS the rule was explicitly "true", in which case the config key :default_attribute_rule is passed.
184
+ # After the above is done, if the element was kept, it's time to clean up its attributes so scrub_attributes is called. The rule is passed to it as it's assumed to be the attribute rules (see Hpricot::Scrub.keep_attribute?) to apply to the attributes, UNLESS the rule was explicitly "true", in which case the config key :default_attribute_rule is passed.
184
185
  #
185
186
  # This is recursive and will do all the above to all the children of the element as well.
186
187
  #
@@ -194,6 +195,10 @@ module Hpricot
194
195
 
195
196
  rule = config[:elem_rules].has_key?(name) ? config[:elem_rules][name] : config[:default_elem_rule]
196
197
 
198
+ while rule.is_a?(Proc)
199
+ rule = rule.call(self)
200
+ end
201
+
197
202
  if !rule
198
203
  remove
199
204
  elsif rule == :strip
@@ -2,7 +2,7 @@ module HpricotScrub #:nodoc:
2
2
  module VERSION #:nodoc:
3
3
  MAJOR = 0
4
4
  MINOR = 3
5
- TINY = 0
5
+ TINY = 1
6
6
 
7
7
  STRING = [MAJOR, MINOR, TINY].join('.')
8
8
  end
@@ -15,7 +15,9 @@ class HpricotScrubTest < Test::Unit::TestCase
15
15
  "a" => {
16
16
  "href" => %r|^https?://|i
17
17
  },
18
- "b" => true,
18
+ "b" => Proc.new do |bold_element|
19
+ bold_element.parent.name == "p" ? true : :strip
20
+ end,
19
21
  "body" => {
20
22
  "lang" => %w(en es fr)
21
23
  },
@@ -67,7 +69,6 @@ class HpricotScrubTest < Test::Unit::TestCase
67
69
  def test_elem_rule_keep
68
70
  @scrubbed_docs.each_with_index do |doc, i|
69
71
  assert_equal @docs[i].search("//a").length, doc.search("//a").length
70
- assert_equal @docs[i].search("//b").length, doc.search("//b").length
71
72
  assert_equal @docs[i].search("//img").length, doc.search("//img").length
72
73
  end
73
74
  end
@@ -86,6 +87,12 @@ class HpricotScrubTest < Test::Unit::TestCase
86
87
  end
87
88
  end
88
89
 
90
+ def test_elem_rule_proc
91
+ @scrubbed_docs.each_with_index do |doc, i|
92
+ assert_equal @docs[i].search("//p/b").length, doc.search("//b").length
93
+ end
94
+ end
95
+
89
96
  def test_attr_default_rule_removes
90
97
  @scrubbed_docs.each do |doc|
91
98
  assert_equal 0, doc.search("*[@mce_src]").length
metadata CHANGED
@@ -3,8 +3,8 @@ rubygems_version: 0.9.4
3
3
  specification_version: 1
4
4
  name: hpricot_scrub
5
5
  version: !ruby/object:Gem::Version
6
- version: 0.3.0
7
- date: 2008-04-22 00:00:00 -05:00
6
+ version: 0.3.1
7
+ date: 2008-06-04 00:00:00 -05:00
8
8
  summary: Scrub HTML with Hpricot
9
9
  require_paths:
10
10
  - lib
@@ -64,7 +64,16 @@ dependencies:
64
64
  version_requirement:
65
65
  version_requirements: !ruby/object:Gem::Version::Requirement
66
66
  requirements:
67
- - - ">="
67
+ - - ">"
68
68
  - !ruby/object:Gem::Version
69
- version: "0.5"
69
+ version: 0.0.0
70
+ version:
71
+ - !ruby/object:Gem::Dependency
72
+ name: ">= 0.5"
73
+ version_requirement:
74
+ version_requirements: !ruby/object:Gem::Version::Requirement
75
+ requirements:
76
+ - - ">"
77
+ - !ruby/object:Gem::Version
78
+ version: 0.0.0
70
79
  version: