hpricot_scrub 0.3.0 → 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG.txt +8 -0
- data/lib/hpricot_scrub/hpricot_scrub.rb +10 -5
- data/lib/hpricot_scrub/version.rb +1 -1
- data/test/hpricot_scrub_test.rb +9 -2
- metadata +13 -4
data/CHANGELOG.txt
CHANGED
@@ -1,3 +1,11 @@
|
|
1
|
+
2008-06-03 Mina Naguib <mina.hpricotscrub@naguib.ca>
|
2
|
+
Release 0.3.1
|
3
|
+
- Allow an element rule to be a Proc (which is expected to return one of
|
4
|
+
the other non-proc rules (false/:strip/true/attr_rules)). This allows the
|
5
|
+
building of highly custom filtering rules (for example filter out <B>
|
6
|
+
tags unless their parent is a <P>...)
|
7
|
+
- Slight cleanup of internal documentation
|
8
|
+
|
1
9
|
2008-01-11 Mina Naguib <mina.hpricotscrub@naguib.ca>
|
2
10
|
Release 0.3.0
|
3
11
|
Large overhaul of the module's logic to mimic most of perl's HTML::Scrubber
|
@@ -38,15 +38,15 @@ module Hpricot
|
|
38
38
|
#
|
39
39
|
# Merge+delete legacy config keys
|
40
40
|
#
|
41
|
-
# :remove_tags
|
41
|
+
# :remove_tags -> :elem_rules (false)
|
42
42
|
(config.delete(:remove_tags) || []).each do |tag|
|
43
43
|
config[:elem_rules][tag] = false unless config[:elem_rules].has_key?(tag)
|
44
44
|
end
|
45
|
-
# :allow_tags
|
45
|
+
# :allow_tags -> :elem_rules (true)
|
46
46
|
(config.delete(:allow_tags) || []).each do |tag|
|
47
47
|
config[:elem_rules][tag] = true unless config[:elem_rules].has_key?(tag)
|
48
48
|
end
|
49
|
-
# :allow_attributes
|
49
|
+
# :allow_attributes -> :default_attribute_rule (procs)
|
50
50
|
(config.delete(:allow_attributes) || []).each do |attribute|
|
51
51
|
#
|
52
52
|
# Add it to the default attribute rule
|
@@ -174,13 +174,14 @@ module Hpricot
|
|
174
174
|
# Scrubs the element according to the given config
|
175
175
|
# The relevant config key is :elem_rules. It is expected to be a Hash having String HTML tag names as keys, and a rule as values
|
176
176
|
# The rule value dictates what happens to the element. The following logic is used:
|
177
|
-
# If the
|
177
|
+
# If the rule is false/nil, the element is removed along with all its children
|
178
178
|
# If the rule is :strip, the element is stripped (the element itself is deleted and its children are promoted upwards to where it was)
|
179
|
+
# If the rule is a proc, the proc is called (and given the element itself) - the proc's expected to return a valid rule that matches this documentation
|
179
180
|
# Otherwise the element is kept
|
180
181
|
#
|
181
182
|
# If the element name (HTML tag) was not found in :elem_rules, the default rule in config key :default_elem_rule is used
|
182
183
|
#
|
183
|
-
# After the above is done,
|
184
|
+
# After the above is done, if the element was kept, it's time to clean up its attributes so scrub_attributes is called. The rule is passed to it as it's assumed to be the attribute rules (see Hpricot::Scrub.keep_attribute?) to apply to the attributes, UNLESS the rule was explicitly "true", in which case the config key :default_attribute_rule is passed.
|
184
185
|
#
|
185
186
|
# This is recursive and will do all the above to all the children of the element as well.
|
186
187
|
#
|
@@ -194,6 +195,10 @@ module Hpricot
|
|
194
195
|
|
195
196
|
rule = config[:elem_rules].has_key?(name) ? config[:elem_rules][name] : config[:default_elem_rule]
|
196
197
|
|
198
|
+
while rule.is_a?(Proc)
|
199
|
+
rule = rule.call(self)
|
200
|
+
end
|
201
|
+
|
197
202
|
if !rule
|
198
203
|
remove
|
199
204
|
elsif rule == :strip
|
data/test/hpricot_scrub_test.rb
CHANGED
@@ -15,7 +15,9 @@ class HpricotScrubTest < Test::Unit::TestCase
|
|
15
15
|
"a" => {
|
16
16
|
"href" => %r|^https?://|i
|
17
17
|
},
|
18
|
-
"b" =>
|
18
|
+
"b" => Proc.new do |bold_element|
|
19
|
+
bold_element.parent.name == "p" ? true : :strip
|
20
|
+
end,
|
19
21
|
"body" => {
|
20
22
|
"lang" => %w(en es fr)
|
21
23
|
},
|
@@ -67,7 +69,6 @@ class HpricotScrubTest < Test::Unit::TestCase
|
|
67
69
|
def test_elem_rule_keep
|
68
70
|
@scrubbed_docs.each_with_index do |doc, i|
|
69
71
|
assert_equal @docs[i].search("//a").length, doc.search("//a").length
|
70
|
-
assert_equal @docs[i].search("//b").length, doc.search("//b").length
|
71
72
|
assert_equal @docs[i].search("//img").length, doc.search("//img").length
|
72
73
|
end
|
73
74
|
end
|
@@ -86,6 +87,12 @@ class HpricotScrubTest < Test::Unit::TestCase
|
|
86
87
|
end
|
87
88
|
end
|
88
89
|
|
90
|
+
def test_elem_rule_proc
|
91
|
+
@scrubbed_docs.each_with_index do |doc, i|
|
92
|
+
assert_equal @docs[i].search("//p/b").length, doc.search("//b").length
|
93
|
+
end
|
94
|
+
end
|
95
|
+
|
89
96
|
def test_attr_default_rule_removes
|
90
97
|
@scrubbed_docs.each do |doc|
|
91
98
|
assert_equal 0, doc.search("*[@mce_src]").length
|
metadata
CHANGED
@@ -3,8 +3,8 @@ rubygems_version: 0.9.4
|
|
3
3
|
specification_version: 1
|
4
4
|
name: hpricot_scrub
|
5
5
|
version: !ruby/object:Gem::Version
|
6
|
-
version: 0.3.0
|
7
|
-
date: 2008-04
|
6
|
+
version: 0.3.1
|
7
|
+
date: 2008-06-04 00:00:00 -05:00
|
8
8
|
summary: Scrub HTML with Hpricot
|
9
9
|
require_paths:
|
10
10
|
- lib
|
@@ -64,7 +64,16 @@ dependencies:
|
|
64
64
|
version_requirement:
|
65
65
|
version_requirements: !ruby/object:Gem::Version::Requirement
|
66
66
|
requirements:
|
67
|
-
- - "
|
67
|
+
- - ">"
|
68
68
|
- !ruby/object:Gem::Version
|
69
|
-
version:
|
69
|
+
version: 0.0.0
|
70
|
+
version:
|
71
|
+
- !ruby/object:Gem::Dependency
|
72
|
+
name: ">= 0.5"
|
73
|
+
version_requirement:
|
74
|
+
version_requirements: !ruby/object:Gem::Version::Requirement
|
75
|
+
requirements:
|
76
|
+
- - ">"
|
77
|
+
- !ruby/object:Gem::Version
|
78
|
+
version: 0.0.0
|
70
79
|
version:
|