hpricot_scrub 0.3.0 → 0.3.1
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG.txt +8 -0
- data/lib/hpricot_scrub/hpricot_scrub.rb +10 -5
- data/lib/hpricot_scrub/version.rb +1 -1
- data/test/hpricot_scrub_test.rb +9 -2
- metadata +13 -4
data/CHANGELOG.txt
CHANGED
@@ -1,3 +1,11 @@
|
|
1
|
+
2008-06-03 Mina Naguib <mina.hpricotscrub@naguib.ca>
|
2
|
+
Release 0.3.1
|
3
|
+
- Allow an element rule to be a Proc (which is expected to return one of
|
4
|
+
the other non-proc rules (false/:strip/true/attr_rules)). This allows the
|
5
|
+
building of highly custom filtering rules (for example filter out <B>
|
6
|
+
tags unless their parent is a <P>...)
|
7
|
+
- Slight cleanup of internal documentation
|
8
|
+
|
1
9
|
2008-01-11 Mina Naguib <mina.hpricotscrub@naguib.ca>
|
2
10
|
Release 0.3.0
|
3
11
|
Large overhaul of the module's logic to mimic most of perl's HTML::Scrubber
|
@@ -38,15 +38,15 @@ module Hpricot
|
|
38
38
|
#
|
39
39
|
# Merge+delete legacy config keys
|
40
40
|
#
|
41
|
-
# :remove_tags
|
41
|
+
# :remove_tags -> :elem_rules (false)
|
42
42
|
(config.delete(:remove_tags) || []).each do |tag|
|
43
43
|
config[:elem_rules][tag] = false unless config[:elem_rules].has_key?(tag)
|
44
44
|
end
|
45
|
-
# :allow_tags
|
45
|
+
# :allow_tags -> :elem_rules (true)
|
46
46
|
(config.delete(:allow_tags) || []).each do |tag|
|
47
47
|
config[:elem_rules][tag] = true unless config[:elem_rules].has_key?(tag)
|
48
48
|
end
|
49
|
-
# :allow_attributes
|
49
|
+
# :allow_attributes -> :default_attribute_rule (procs)
|
50
50
|
(config.delete(:allow_attributes) || []).each do |attribute|
|
51
51
|
#
|
52
52
|
# Add it to the default attribute rule
|
@@ -174,13 +174,14 @@ module Hpricot
|
|
174
174
|
# Scrubs the element according to the given config
|
175
175
|
# The relevant config key is :elem_rules. It is expected to be a Hash having String HTML tag names as keys, and a rule as values
|
176
176
|
# The rule value dictates what happens to the element. The following logic is used:
|
177
|
-
# If the
|
177
|
+
# If the rule is false/nil, the element is removed along with all its children
|
178
178
|
# If the rule is :strip, the element is stripped (the element itself is deleted and its children are promoted upwards to where it was)
|
179
|
+
# If the rule is a proc, the proc is called (and given the element itself) - the proc's expected to return a valid rule that matches this documentation
|
179
180
|
# Otherwise the element is kept
|
180
181
|
#
|
181
182
|
# If the element name (HTML tag) was not found in :elem_rules, the default rule in config key :default_elem_rule is used
|
182
183
|
#
|
183
|
-
# After the above is done,
|
184
|
+
# After the above is done, if the element was kept, it's time to clean up its attributes so scrub_attributes is called. The rule is passed to it as it's assumed to be the attribute rules (see Hpricot::Scrub.keep_attribute?) to apply to the attributes, UNLESS the rule was explicitly "true", in which case the config key :default_attribute_rule is passed.
|
184
185
|
#
|
185
186
|
# This is recursive and will do all the above to all the children of the element as well.
|
186
187
|
#
|
@@ -194,6 +195,10 @@ module Hpricot
|
|
194
195
|
|
195
196
|
rule = config[:elem_rules].has_key?(name) ? config[:elem_rules][name] : config[:default_elem_rule]
|
196
197
|
|
198
|
+
while rule.is_a?(Proc)
|
199
|
+
rule = rule.call(self)
|
200
|
+
end
|
201
|
+
|
197
202
|
if !rule
|
198
203
|
remove
|
199
204
|
elsif rule == :strip
|
data/test/hpricot_scrub_test.rb
CHANGED
@@ -15,7 +15,9 @@ class HpricotScrubTest < Test::Unit::TestCase
|
|
15
15
|
"a" => {
|
16
16
|
"href" => %r|^https?://|i
|
17
17
|
},
|
18
|
-
"b" =>
|
18
|
+
"b" => Proc.new do |bold_element|
|
19
|
+
bold_element.parent.name == "p" ? true : :strip
|
20
|
+
end,
|
19
21
|
"body" => {
|
20
22
|
"lang" => %w(en es fr)
|
21
23
|
},
|
@@ -67,7 +69,6 @@ class HpricotScrubTest < Test::Unit::TestCase
|
|
67
69
|
def test_elem_rule_keep
|
68
70
|
@scrubbed_docs.each_with_index do |doc, i|
|
69
71
|
assert_equal @docs[i].search("//a").length, doc.search("//a").length
|
70
|
-
assert_equal @docs[i].search("//b").length, doc.search("//b").length
|
71
72
|
assert_equal @docs[i].search("//img").length, doc.search("//img").length
|
72
73
|
end
|
73
74
|
end
|
@@ -86,6 +87,12 @@ class HpricotScrubTest < Test::Unit::TestCase
|
|
86
87
|
end
|
87
88
|
end
|
88
89
|
|
90
|
+
def test_elem_rule_proc
|
91
|
+
@scrubbed_docs.each_with_index do |doc, i|
|
92
|
+
assert_equal @docs[i].search("//p/b").length, doc.search("//b").length
|
93
|
+
end
|
94
|
+
end
|
95
|
+
|
89
96
|
def test_attr_default_rule_removes
|
90
97
|
@scrubbed_docs.each do |doc|
|
91
98
|
assert_equal 0, doc.search("*[@mce_src]").length
|
metadata
CHANGED
@@ -3,8 +3,8 @@ rubygems_version: 0.9.4
|
|
3
3
|
specification_version: 1
|
4
4
|
name: hpricot_scrub
|
5
5
|
version: !ruby/object:Gem::Version
|
6
|
-
version: 0.3.0
|
7
|
-
date: 2008-04
|
6
|
+
version: 0.3.1
|
7
|
+
date: 2008-06-04 00:00:00 -05:00
|
8
8
|
summary: Scrub HTML with Hpricot
|
9
9
|
require_paths:
|
10
10
|
- lib
|
@@ -64,7 +64,16 @@ dependencies:
|
|
64
64
|
version_requirement:
|
65
65
|
version_requirements: !ruby/object:Gem::Version::Requirement
|
66
66
|
requirements:
|
67
|
-
- - "
|
67
|
+
- - ">"
|
68
68
|
- !ruby/object:Gem::Version
|
69
|
-
version:
|
69
|
+
version: 0.0.0
|
70
|
+
version:
|
71
|
+
- !ruby/object:Gem::Dependency
|
72
|
+
name: ">= 0.5"
|
73
|
+
version_requirement:
|
74
|
+
version_requirements: !ruby/object:Gem::Version::Requirement
|
75
|
+
requirements:
|
76
|
+
- - ">"
|
77
|
+
- !ruby/object:Gem::Version
|
78
|
+
version: 0.0.0
|
70
79
|
version:
|