htmlfilter 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/History.rdoc ADDED
@@ -0,0 +1,6 @@
1
+ === 1.0.0 / 2009-06-25
2
+
3
+ * 1 major enhancement
4
+
5
+ * Birthday!
6
+
data/Manifest.txt ADDED
@@ -0,0 +1,19 @@
1
+ #!mast bin lib meta test [A-Z]*
2
+ lib
3
+ lib/cssfilter.rb
4
+ lib/htmlfilter
5
+ lib/htmlfilter/multiton.rb
6
+ lib/htmlfilter.rb
7
+ meta
8
+ meta/package
9
+ meta/project
10
+ meta/title
11
+ meta/version
12
+ test
13
+ test/test_cssfilter.rb
14
+ test/test_htmlfilter.rb
15
+ Rakefile
16
+ Manifest.txt
17
+ TODO
18
+ README.rdoc
19
+ History.rdoc
data/README.rdoc ADDED
@@ -0,0 +1,53 @@
1
+ = HtmlFilter
2
+
3
+ * http://rubyworks.github.com/htmlfilter
4
+
5
+ == DESCRIPTION:
6
+
7
+ HTML Filter library can be used to sanitize and sterilize
8
+ HTML. A good idea if you let users submit HTML in comments,
9
+ for instance.
10
+
11
+ This library also includes CssFilter. The CssFilter class will
12
+ clean-up a cascading style sheet. It can be used to remove
13
+ whitespace and most importantly remove urls.
14
+
15
+ == FEATURES:
16
+
17
+ * Sanitize HTML
18
+ * Compress CSS
19
+
20
+ == SYNOPSIS:
21
+
22
+ Via the class.
23
+
24
+ html = "<<b>hello</b>"
25
+
26
+ HtmlFilter.new(options).filter(html)
27
+
28
+ Or using the String extension.
29
+
30
+ html.html_filter #=> "<b>hello</b>"
31
+
32
+ See RDocs for more information.
33
+
34
+ == REQUIREMENTS:
35
+
36
+ * Uses a copy of multiton.rb (included)
37
+
38
+ == INSTALL:
39
+
40
+ * sudo gem install htmlfilter
41
+
42
+ == LICENSE:
43
+
44
+ (Creative Commons Attribution-ShareAlike License)
45
+
46
+ Copyright (c) 2009 Thomas Sawyer
47
+
48
+ See http://creativecommons.org/licenses/by-sa/3.0/deed.en
49
+
50
+ HtmlFilter is a port of lib_filter.php, v1.15 by Cal Henderson <cal@iamcal.com>.
51
+ This code is licensed under a Creative Commons Attribution-ShareAlike 2.5 License.
52
+ See http://creativecommons.org/licenses/by-sa/2.5/.
53
+
data/Rakefile ADDED
@@ -0,0 +1,15 @@
1
+ # -*- ruby -*-
2
+
3
+ #$: << './lib'
4
+ #require 'rubygems'
5
+ #require 'hoe'
6
+ #require 'htmlfilter'
7
+ #Hoe.new('htmlfilter', HtmlFilter::VERSION) do |p|
8
+ # p.rubyforge_name = 'death' # if different than lowercase project name
9
+ # p.developer('Thomas Sawyer', 'transfire@gmail.com')
10
+ #end
11
+
12
+
13
+
14
+ # vim: syntax=Ruby
15
+
data/TODO ADDED
@@ -0,0 +1,7 @@
1
+ = TODO List
2
+
3
+ * Maybe write executable(s) to use library via commandline.
4
+ * Elaborate on Features list in README.rdoc.
5
+ * Rename class to HTMLFilter (instead of HtmlFilter)
6
+
7
+
data/lib/cssfilter.rb ADDED
@@ -0,0 +1,226 @@
1
+ # = CSS Filter
2
+ #
3
+ # The CssFilter class will clean up a cascading style sheet.
4
+ # It can be used to remove whitespace and most importantly
5
+ # remove urls.
6
+ #
7
+ # == Authors
8
+ #
9
+ # * Trans
10
+ #
11
+ # == Todo
12
+ #
13
+ # * Allow urls to be specified per attribute type.
14
+ #
15
+ # == Copying
16
+ #
17
+ # Copyright (c) 2007 7rans
18
+
19
+ #require 'htmlfilter/uri'
20
+ require 'uri'
21
+
22
+ # = CSS Filter
23
+ #
24
+ # The CssFilter class will clean up a cascading style sheet.
25
+ # It can be used to remove whitespace and most importantly
26
+ # remove urls.
27
+ #
28
+ class CssFilter
29
+ VERSION="1.0.0"
30
+
31
+ # should we remove comments? (true, false)
32
+ attr_accessor :strip_comments
33
+
34
+ # should we remove urls? (true, false)
35
+ attr_accessor :strip_urls
36
+
37
+ # url schemes which will be allowed (http, ftp, mailto)
38
+ attr_accessor :allowed_scheme
39
+
40
+ # alias for allowed_scheme
41
+ alias_method :allowed_protocols, :allowed_scheme
42
+ alias_method :allowed_protocols=, :allowed_scheme=
43
+
44
+ # url hosts which will be allowed.
45
+ attr_accessor :allowed_hosts
46
+
47
+ # urls which will be allowed. (NOT YET USED)
48
+ attr_accessor :allowed_urls
49
+
50
+ # substitute urls (NOT YET USED)
51
+ attr_accessor :substitute_urls
52
+
53
+ # remove leading and trailing whitespace.
54
+ attr_accessor :strip_whitespace
55
+
56
+ # remove blank lines.
57
+ attr_accessor :strip_blanklines
58
+
59
+ # Complete parse and rewrite of CSS document.
60
+ # This does a complete "cleaning" but note that
61
+ # it is not yet a perfect parser.
62
+ attr_accessor :rewrite
63
+
64
+ # CssFilter option defaults.
65
+
66
+ DEFAULT = {
67
+ 'strip_comments' => true,
68
+ 'strip_urls' => true,
69
+ 'allowed_urls' => [],
70
+ 'allowed_hosts' => [],
71
+ 'allowed_scheme' => [],
72
+ 'strip_whitespace' => false,
73
+ 'strip_blanklines' => true,
74
+ 'rewrite' => false,
75
+ 'substitute_urls' => {}
76
+ }
77
+
78
+ #
79
+
80
+ def initialize(options=nil)
81
+ if options
82
+ h = DEFAULT.dup
83
+ options.each do |k,v|
84
+ h[k.to_s] = v
85
+ end
86
+ options = h
87
+ else
88
+ options = DEFAULT.dup
89
+ end
90
+
91
+ options.each{ |k,v| send("#{k}=",v) }
92
+ end
93
+
94
+ #
95
+
96
# Adds +host+ to the list of hosts whose urls are allowed to stay in
# the CSS (see allowed_hosts).
#
# A new array is assigned rather than appending in place, because the
# current list may be the very array object held by DEFAULT
# (initialize only makes a shallow copy of DEFAULT).
#
# Returns the updated list of allowed hosts.
def accept_host(host)
  self.allowed_hosts = Array(allowed_hosts) + [host]
end
99
+
100
+ #
101
+
102
+ def filter(css)
103
+ css = remove_comments(css) if strip_comments
104
+ css = remove_urls(css) if strip_urls
105
+
106
+ css = remove_nullvalues(css)
107
+
108
+ css = remove_whitespace(css) if strip_whitespace
109
+ css = remove_blanklines(css) if strip_blanklines
110
+
111
+ css = parse(css).to_css if rewrite
112
+ css
113
+ end
114
+
115
+ #
116
+
117
# Strips every /* ... */ comment from +data+, including comments that
# span several lines (/m lets '.' match newlines).
#
# Returns a new string; +data+ itself is left untouched.
def remove_comments(data)
  data.gsub(%r{/\*.*?\*/}m, '')
end
120
+
121
+ # TODO: allowed_urls
122
+
123
# Removes the url from every url(...) value unless its host is listed in
# allowed_hosts or its scheme is listed in allowed_scheme; the emptied
# "url()" wrappers are then removed as well.
#
# Works on a copy, so the caller's string is never modified and a frozen
# string is accepted. Returns the cleaned CSS.
def remove_urls(data)
  data = data.dup
  urls = data.scan(/url\((.*?)\)/).flatten
  uris = urls.collect { |u| URI.extract(u) }.flatten
  uris.each do |u|
    uri = URI.parse(u)
    next if allowed_hosts.include?(uri.host) || allowed_scheme.include?(uri.scheme)
    # only the first occurrence is removed, once per extracted uri
    data.sub!(u.to_s, '')
  end
  data.gsub(/url\(\s*\)/, '')
end
135
+
136
+ #
137
+
138
+ def remove_whitespace(data)
139
+ data = data.gsub(/^\s*/,'')
140
+ data = data.gsub(/\s*$/,'')
141
+ end
142
+
143
+ #
144
+
145
+ def remove_blanklines(data)
146
+ data = data.gsub(/^\s*\n/,'')
147
+ end
148
+
149
+ #
150
+
151
+ def remove_nullvalues(data);
152
+ data = data.gsub(/\w+[:](\s+)[;]/,'')
153
+ end
154
+
155
+ # Breaks a css document up into a hash. This can be used
156
+ # for completely rewriting the css.
157
+ #
158
+ # TODO: Not complete, does not work with "@xxx foo;" for example.
159
+
160
+ def parse(css)
161
+ tree = CssTree.new
162
+ entries = css.scan(/^(.*?)\{(.*?)\}/m)
163
+ entries.each do |ref, props|
164
+ tree[ref.strip] ||= {}
165
+ props = clean_properties(props)
166
+ props = props.scan(/(.*?)[:](.*?)([;]|\s*\Z)/)
167
+ props.each do |(key,val)|
168
+ tree[ref.strip][key.strip] = clean_value(val)
169
+ end
170
+ end
171
+ return tree
172
+ end
173
+
174
+ # Takes a css entry and ensures it is valid (as best it can).
175
+ # It will fix trivial mistakes, and raise an error when it is
176
+ # beyond repair.
177
+ #
178
+ # TODO: So far this does absolutely nothing!
179
+
180
+ def clean_properties(atts)
181
+ atts
182
+ end
183
+
184
+ #
185
+
186
+ def clean_value(val)
187
+ val = val.strip
188
+
189
+ if urls
190
+ uris = URI.extract(val)
191
+ uris.each do |u|
192
+ val.sub!(u.to_s, urls)
193
+ end
194
+ end
195
+
196
+ return val
197
+ end
198
+
199
+ end
200
+
201
+
202
+ # CSS parse tree. This is for a "deep filtering".
203
+
204
+ class CssTree < Hash
205
+
206
+ def initialize(options=nil)
207
+ @options = options || {}
208
+ super()
209
+ end
210
+
211
+ # Re-output the CSS, all tidy ;)
212
+
213
+ def to_css
214
+ css = ""
215
+ each do |selector, entries|
216
+ css << "#{selector}{"
217
+ entries.each do |key, value|
218
+ css << "#{key}:#{value};"
219
+ end
220
+ css << "}\n"
221
+ end
222
+ return css
223
+ end
224
+
225
+ end
226
+
@@ -0,0 +1,386 @@
1
+ # = Multiton
2
+ #
3
+ # == Synopsis
4
+ #
5
+ # Multiton design pattern ensures only one object is allocated for a given state.
6
+ #
7
+ # The 'multiton' pattern is similar to a singleton, but instead of only one
8
+ # instance, there are several similar instances. It is useful when you want to
9
+ # avoid constructing objects many times because of some huge expense (connecting
10
+ # to a database for example), require a set of similar but not identical
11
+ # objects, and cannot easily control how many times a constructor may be called.
12
+ #
13
+ # class SomeMultitonClass
14
+ # include Multiton
15
+ # attr :arg
16
+ # def initialize(arg)
17
+ # @arg = arg
18
+ # end
19
+ # end
20
+ #
21
+ # a = SomeMultitonClass.new(4)
22
+ # b = SomeMultitonClass.new(4) # a and b are same object
23
+ # c = SomeMultitonClass.new(2) # c is a different object
24
+ #
25
+ # == Previous Behavior
26
+ #
27
+ # In previous versions of Multiton the #new method was made
28
+ # private and #instance had to be used in its stead --just like Singleton.
29
+ # But this is less desirable for Multiton since Multitons can
30
+ # have multiple instances, not just one.
31
+ #
32
+ # So instead Multiton now defines #create as a private alias of
33
+ # the original #new method (just in case it is needed) and then
34
+ # defines #new to handle the multiton; #instance is provided
35
+ # as an alias for it.
36
+ #
37
+ #--
38
+ # So if you must have the old behavior, all you need do is re-alias
39
+ # #new to #create and privatize it.
40
+ #
41
+ # class SomeMultitonClass
42
+ # include Multiton
43
+ # alias_method :new, :create
44
+ # private :new
45
+ # ...
46
+ # end
47
+ #
48
+ # Then only #instance will be available for creating the Multiton.
49
+ #++
50
+ #
51
+ # == How It Works
52
+ #
53
+ # A pool of objects is searched for a previously cached object,
54
+ # if one is not found we construct one and cache it in the pool
55
+ # based on class and the args given to the constructor.
56
+ #
57
+ # A limitation of this approach is that it is impossible to
58
+ # detect if different blocks were given to a constructor (if it takes a
59
+ # block). So it is the constructor arguments _only_ which determine
60
+ # the uniqueness of an object. To workaround this, define the _class_
61
+ # method ::multiton_id.
62
+ #
63
+ # def Klass.multiton_id(*args, &block)
64
+ # # ...
65
+ # end
66
+ #
67
+ # Which should return a hash key used to identify the object being
68
+ # constructed as (not) unique.
69
+ #
70
+ # == Authors
71
+ #
72
+ # * Christoph Rippel
73
+ # * Thomas Sawyer
74
+ #
75
+ # = Copying
76
+ #
77
+ # Copyright (c) 2007 Christoph Rippel, Thomas Sawyer
78
+ #
79
+ # Ruby License
80
+ #
81
+ # This module is free software. You may use, modify, and/or redistribute this
82
+ # software under the same terms as Ruby.
83
+ #
84
+ # This program is distributed in the hope that it will be useful, but WITHOUT
85
+ # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
86
+ # FOR A PARTICULAR PURPOSE.
87
+
88
+ require 'thread'
89
+
90
+ # = Multiton
91
+ #
92
+ # Multiton design pattern ensures only one object is allocated for a given state.
93
+ #
94
+ # The 'multiton' pattern is similar to a singleton, but instead of only one
95
+ # instance, there are several similar instances. It is useful when you want to
96
+ # avoid constructing objects many times because of some huge expense (connecting
97
+ # to a database for example), require a set of similar but not identical
98
+ # objects, and cannot easily control how many times a constructor may be called.
99
+ #
100
+ # class SomeMultitonClass
101
+ # include Multiton
102
+ # attr :arg
103
+ # def initialize(arg)
104
+ # @arg = arg
105
+ # end
106
+ # end
107
+ #
108
+ # a = SomeMultitonClass.new(4)
109
+ # b = SomeMultitonClass.new(4) # a and b are same object
110
+ # c = SomeMultitonClass.new(2) # c is a different object
111
+ #
112
+ # == How It Works
113
+ #
114
+ # A pool of objects is searched for a previously cached object,
115
+ # if one is not found we construct one and cache it in the pool
116
+ # based on class and the args given to the constructor.
117
+ #
118
+ # A limitation of this approach is that it is impossible to
119
+ # detect if different blocks were given to a constructor (if it takes a
120
+ # block). So it is the constructor arguments _only_ which determine
121
+ # the uniqueness of an object. To workaround this, define the _class_
122
+ # method ::multiton_id.
123
+ #
124
+ # def Klass.multiton_id(*args, &block)
125
+ # # ...
126
+ # end
127
+ #
128
+ # Which should return a hash key used to identify the object being
129
+ # constructed as (not) unique.
130
+
131
+ module Multiton
132
+
133
+ # disable built-in copying methods
134
+
135
+ def clone
136
+ raise TypeError, "can't clone Multiton #{self}"
137
+ #self
138
+ end
139
+
140
+ def dup
141
+ raise TypeError, "can't dup Multiton #{self}"
142
+ #self
143
+ end
144
+
145
+ # default marshalling strategy
146
+
147
+ protected
148
+
149
+ def _dump(depth=-1)
150
+ Marshal.dump(@multiton_initializer)
151
+ end
152
+
153
+ # Mutex to safely store multiton instances.
154
+
155
+ class InstanceMutex < Hash #:nodoc:
156
+ def initialize
157
+ @global = Mutex.new
158
+ end
159
+
160
+ def initialized(arg)
161
+ store(arg, DummyMutex)
162
+ end
163
+
164
+ def (DummyMutex = Object.new).synchronize
165
+ yield
166
+ end
167
+
168
+ def default(arg)
169
+ @global.synchronize{ fetch(arg){ store(arg, Mutex.new) } }
170
+ end
171
+ end
172
+
173
+ # Multiton can be included in another module, in which case that module effectively becomes
174
+ # a multiton behavior distributor too. This is why we propagate #included to the base module
175
+ # by putting it in another module.
176
+ #
177
+ #--
178
+ # def append_features(mod)
179
+ # # help out people counting on transitive mixins
180
+ # unless mod.instance_of?(Class)
181
+ # raise TypeError, "Inclusion of Multiton in module #{mod}"
182
+ # end
183
+ # super
184
+ # end
185
+ #++
186
+
187
+ module Inclusive
188
+ private
189
+ def included(base)
190
+ class << base
191
+ #alias_method(:new!, :new) unless method_defined?(:new!)
192
+ # gracefully handle multiple inclusions of Multiton
193
+ unless include?(Multiton::MetaMethods)
194
+ alias_method :new!, :new
195
+ private :allocate #, :new
196
+ include Multiton::MetaMethods
197
+
198
+ if method_defined?(:marshal_dump)
199
+ undef_method :marshal_dump
200
+ warn "warning: marshal_dump was undefined since it is incompatible with the Multiton pattern"
201
+ end
202
+ end
203
+ end
204
+ end
205
+ end
206
+
207
+ extend Inclusive
208
+
209
+ #
210
+
211
+ module MetaMethods
212
+
213
+ include Inclusive
214
+
215
+ def instance(*e, &b)
216
+ arg = multiton_id(*e, &b)
217
+ multiton_instance.fetch(arg) do
218
+ multiton_mutex[arg].synchronize do
219
+ multiton_instance.fetch(arg) do
220
+ val = multiton_instance[arg] = new!(*e, &b) #new(*e, &b)
221
+ val.instance_variable_set(:@multiton_initializer, e, &b)
222
+ multiton_mutex.initialized(arg)
223
+ val
224
+ end
225
+ end
226
+ end
227
+ end
228
+ alias_method :new, :instance
229
+
230
+ def initialized?(*e, &b)
231
+ multiton_instance.key?(multiton_id(*e, &b))
232
+ end
233
+
234
+ protected
235
+
236
+ def multiton_instance
237
+ @multiton_instance ||= Hash.new
238
+ end
239
+
240
+ def multiton_mutex
241
+ @multiton_mutex ||= InstanceMutex.new
242
+ end
243
+
244
+ def reinitialize
245
+ multiton_instance.clear
246
+ multiton_mutex.clear
247
+ end
248
+
249
+ def _load(str)
250
+ instance(*Marshal.load(str))
251
+ end
252
+
253
+ private
254
+
255
+ # Default method to create a key to cache already constructed
256
+ # instances. In the use case MultitonClass.new(e), MultiClass.new(f)
257
+ # must be semantically equal if multiton_id(e).eql?(multiton_id(f))
258
+ # evaluates to true.
259
+ def multiton_id(*e, &b)
260
+ e
261
+ end
262
+
263
# Hook invoked by Ruby each time a singleton method is added to the
# object. Defining +marshal_dump+ is refused with a TypeError; the
# message directs users to _dump and _load instead.
#
# Method names are compared as strings because singleton_methods
# returns Strings on Ruby 1.8 but Symbols on later versions.
def singleton_method_added(sym)
  super
  if sym == :marshal_dump && singleton_methods.any? { |m| m.to_s == 'marshal_dump' }
    raise TypeError, "Don't use marshal_dump - rely on _dump and _load instead"
  end
end
269
+
270
+ end
271
+
272
+ end
273
+
274
+
275
+
276
+
277
+ =begin
278
+ # TODO Convert this into a real test and/or benchmark.
279
+
280
+ if $0 == __FILE__
281
+
282
+ ### Simple marshalling test #######
283
+ class A
284
+ def initialize(a,*e)
285
+ @e = a
286
+ end
287
+
288
+ include Multiton
289
+ begin
290
+ def self.marshal_dump(depth = -1)
291
+ end
292
+ rescue => mes
293
+ p mes
294
+ class << self; undef marshal_dump end
295
+ end
296
+ end
297
+
298
+ C = Class.new(A.clone)
299
+ s = C.instance('a','b')
300
+
301
+ raise unless Marshal.load(Marshal.dump(s)) == s
302
+
303
+
304
+ ### Interdependent initialization example and threading benchmark ###
305
+
306
+ class Regular_SymPlane
307
+ def self.multiton_id(*e)
308
+ a,b = e
309
+ (a+b - 1)*(a+b )/2 + (a > b ? a : b)
310
+ end
311
+
312
+ def initialize(a,b)
313
+ klass = self.class
314
+ if a < b
315
+ @l = b > 0 ? klass.instance(a,b-1) : nil
316
+ @r = a > 0 ? klass.instance(a-1,b) : nil
317
+ else
318
+ @l = a > 0 ? klass.instance(a-1,b) : nil
319
+ @r = b > 0 ? klass.instance(a,b-1) : nil
320
+ end
321
+ end
322
+
323
+ include Multiton
324
+ end
325
+
326
+
327
+
328
+ def nap
329
+ # Thread.pass
330
+ sleep(rand(0.01))
331
+ end
332
+
333
+ class SymPlane < Regular_SymPlane
334
+ @m = Mutex.new
335
+ @count = 0
336
+ end
337
+
338
+ class << SymPlane
339
+ attr_reader :count
340
+ def reinitialize
341
+ super
342
+ @m = Mutex.new
343
+ @count = 0
344
+ end
345
+ def inherited(sub_class)
346
+ super
347
+ sub_class.instance_eval { @m = Mutex.new; @count = 0 }
348
+ end
349
+
350
+ def multiton_id(*e)
351
+ nap()
352
+ super
353
+ end
354
+
355
+ def new!(*e) # NOTICE!!!
356
+ super
357
+ ensure
358
+ nap()
359
+ @m.synchronize { p @count if (@count += 1) % 15 == 0 }
360
+ end
361
+
362
+ def run(k)
363
+ threads = 0
364
+ max = k * (k+1) / 2
365
+ puts ""
366
+ while count() < max
367
+ Thread.new { threads+= 1; instance(rand(30),rand(30)) }
368
+ end
369
+ puts "\nThe simulation created #{threads} threads"
370
+ end
371
+ end
372
+
373
+
374
+ require 'benchmark'
375
+ include Benchmark
376
+
377
+ bmbm do |x|
378
+ x.report('Initialize 465 SymPlane instances') { SymPlane.run(30) }
379
+ x.report('Reinitialize ') do
380
+ sleep 3
381
+ SymPlane.reinitialize
382
+ end
383
+ end
384
+
385
+ end
386
+ =end
data/lib/htmlfilter.rb ADDED
@@ -0,0 +1,516 @@
1
+ # = HTML Filter
2
+ #
3
+ # HTML Filter library can be used to sanitize and sterilize
4
+ # HTML. A good idea if you let users submit HTML in comments,
5
+ # for instance.
6
+ #
7
+ # HtmlFilter is a port of lib_filter.php, v1.15 by Cal Henderson <cal@iamcal.com>
8
+ #
9
+ # This code is licensed under a Creative Commons Attribution-ShareAlike 2.5 License
10
+ # http://creativecommons.org/licenses/by-sa/2.5/
11
+ #
12
+ # Thanks to Jang Kim for adding support for single quoted attributes.
13
+ #
14
+ # == Reference
15
+ #
16
+ # * http://iamcal.com/publish/articles/php/processing_html/
17
+ # * http://iamcal.com/publish/articles/php/processing_html_part_2/
18
+ #
19
+ # == Author(s)
20
+ #
21
+ # * Trans
22
+ # * George Moschovitis
23
+ # * James Britt
24
+ # * Cal Henderson
25
+ # * Jang Kim
26
+ #
27
+ # == Copying
28
+ #
29
+ # Copyright (c) 2007 Trans
30
+
31
+ require 'htmlfilter/multiton.rb'
32
+
33
+ # = HtmlFilter
34
+ #
35
+ # HTML Filter library can be used to sanitize and sterilize
36
+ # HTML. A good idea if you let users submit HTML in comments,
37
+ # for instance.
38
+ #
39
+ # lib_filter.php, v1.15 by Cal Henderson <cal@iamcal.com>
40
+ #
41
+ # This code is licensed under a Creative Commons Attribution-ShareAlike 2.5 License
42
+ # http://creativecommons.org/licenses/by-sa/2.5/
43
+ #
44
+ # Thanks to Jang Kim for adding support for single quoted attributes.
45
+ #
46
+ # == Reference
47
+ #
48
+ # * http://iamcal.com/publish/articles/php/processing_html/
49
+ # * http://iamcal.com/publish/articles/php/processing_html_part_2/
50
+
51
+ class HtmlFilter
52
+ VERSION = "1.0.0"
53
+
54
+ include Multiton
55
+
56
+ # tags and attributes that are allowed
57
+ #
58
+ # Eg.
59
+ #
60
+ # {
61
+ # 'a' => ['href', 'target'],
62
+ # 'b' => [],
63
+ # 'img' => ['src', 'width', 'height', 'alt']
64
+ # }
65
+ attr_accessor :allowed
66
+
67
+ # tags which should always be self-closing (e.g. "<img />")
68
+ attr_accessor :no_close
69
+
70
+ # tags which must always have separate opening and closing
71
+ # tags (e.g. "<b></b>")
72
+ attr_accessor :always_close
73
+
74
+ # attributes which should be checked for valid protocols
75
+ # (src,href)
76
+ attr_accessor :protocol_attributes
77
+
78
+ # protocols which are allowed (http, ftp, mailto)
79
+ attr_accessor :allowed_protocols
80
+
81
+ # tags which should be removed if they contain no content
82
+ # (e.g. "<b></b>" or "<b />")
83
+ attr_accessor :remove_blanks
84
+
85
+ # should we remove comments? (true, false)
86
+ attr_accessor :strip_comments
87
+
88
+ # should we try and make a b tag out of "b>" (true, false)
89
+ attr_accessor :always_make_tags
90
+
91
+ # entity control option (true, false)
92
+ attr_accessor :allow_numbered_entities
93
+
94
+ # entity control option (amp, gt, lt, quot, etc.)
95
+ attr_accessor :allowed_entities
96
+
97
+ # default settings
98
+
99
+ DEFAULT = {
100
+ 'allowed' => {
101
+ 'a' => ['href', 'target'],
102
+ 'b' => [],
103
+ 'i' => [],
104
+ 'img' => ['src', 'width', 'height', 'alt']
105
+ },
106
+ 'no_close' => ['img', 'br', 'hr'],
107
+ 'always_close' => ['a', 'b'],
108
+ 'protocol_attributes' => ['src', 'href'],
109
+ 'allowed_protocols' => ['http', 'ftp', 'mailto'],
110
+ 'remove_blanks' => ['a', 'b'],
111
+ 'strip_comments' => true,
112
+ 'always_make_tags' => true,
113
+ 'allow_numbered_entities' => true,
114
+ 'allowed_entities' => ['amp', 'gt', 'lt', 'quot']
115
+ }
116
+
117
+ # New html filter.
118
+
119
+ def initialize( options=nil )
120
+ if options
121
+ h = DEFAULT.dup
122
+ options.each do |k,v|
123
+ h[k.to_s] = v
124
+ end
125
+ options = h
126
+ else
127
+ options = DEFAULT.dup
128
+ end
129
+
130
+ options.each{ |k,v| send("#{k}=",v) }
131
+ end
132
+
133
+ # Filter html string.
134
+
135
+ def filter(data)
136
+ @tag_counts = {}
137
+
138
+ data = escape_comments(data)
139
+ data = balance_html(data)
140
+ data = check_tags(data)
141
+ data = process_remove_blanks(data)
142
+ data = validate_entities(data)
143
+
144
+ return data
145
+ end
146
+
147
+ private
148
+
149
+ #
150
+ # internal tag counter
151
+ #
152
+
153
+ def tag_counts ; @tag_counts; end
154
+
155
+ #
156
+ #
157
+ #
158
+
159
# Rewrites every HTML comment ("<!-- ... -->") in +data+ with its
# content passed through strip_single and escape_special_chars.
#
# Returns a new string; +data+ is not modified.
def escape_comments(data)
  # /m lets '.' match newlines so comments spanning several lines are
  # covered. (In Ruby the /s flag is not "dotall": it selects the
  # Windows-31J encoding and makes non-ASCII UTF-8 input raise.)
  data.gsub(/<!--(.*?)-->/m) do
    '<!--' + escape_special_chars(strip_single($1)) + '-->'
  end
end
166
+
167
+ #
168
+ #
169
+ #
170
+
171
+ def balance_html(data)
172
+ data = data.dup
173
+
174
+ if always_make_tags
175
+ # try and form html
176
+ data.gsub!(/>>+/, '>')
177
+ data.gsub!(/<<+/, '<')
178
+ data.gsub!(/^>/, '')
179
+ data.gsub!(/<([^>]*?)(?=<|$)/, '<\1>')
180
+ data.gsub!(/(^|>)([^<]*?)(?=>)/, '\1<\2')
181
+ else
182
+ # escape stray brackets
183
+ data.gsub!(/<([^>]*?)(?=<|$)/, '&lt;\1')
184
+ data.gsub!(/(^|>)([^<]*?)(?=>)/, '\1\2&gt;<')
185
+ # the last regexp causes '<>' entities to appear
186
+ # (we need to do a lookahead assertion so that the last bracket
187
+ # can be used in the next pass of the regexp)
188
+ data.gsub!('<>', '')
189
+ end
190
+
191
+ return data
192
+ end
193
+
194
+ #
195
+ #
196
+ #
197
+
198
# Runs every "<...>" tag of +data+ through process_tag and then appends
# a closing tag for each tag that tag_counts still reports as open.
#
# Returns a new string; +data+ is not modified.
def check_tags(data)
  data = data.dup

  # /m lets '.' match newlines, so a tag broken over several lines is
  # filtered too instead of passing through untouched. (In Ruby the /s
  # flag is not "dotall": it selects the Windows-31J encoding.)
  data.gsub!(/<(.*?)>/m) { process_tag(strip_single($1)) }

  tag_counts.each do |tag, cnt|
    cnt.times { data << "</#{tag}>" }
  end

  data
end
211
+
212
+ #
213
+ #
214
+ #
215
+
216
# Sanitizes the inside of one tag (the text between "<" and ">") and
# returns the tag to emit, or '' when the tag has to be dropped.
#
# * A closing tag is kept only when the tag is allowed, is not listed in
#   no_close and is currently open according to tag_counts.
# * An opening tag of an allowed tag is rebuilt from its name and its
#   allowed attributes only; values of protocol_attributes are passed
#   through process_param_protocol.
# * A comment is dropped or kept depending on strip_comments.
# * Anything else is dropped.
#
# Updates tag_counts, the number of currently open tags per tag name.
def process_tag(data)
  # ending tags
  if matches = /\A\/([a-z0-9]+)/i.match(data)
    name = matches[1].downcase
    # A count of 0 means everything opened is closed again; 0 is truthy
    # in Ruby, so the count has to be compared explicitly or surplus
    # closing tags would be emitted and the count would go negative.
    if allowed.key?(name) && !no_close.include?(name) && tag_counts[name].to_i > 0
      tag_counts[name] -= 1
      return "</#{name}>"
    end
    return ''
  end

  # starting tags
  # \A and \z anchor the whole string and /m lets the body span lines.
  if matches = /\A([a-z0-9]+)(.*?)(\/?)\z/mi.match(data)
    name   = matches[1].downcase
    body   = matches[2]
    ending = matches[3]

    return '' unless allowed.key?(name)

    params = ""

    matches_2 = body.scan(/([a-z0-9]+)=(["'])(.*?)\2/i)        # <foo a="b" />
    matches_1 = body.scan(/([a-z0-9]+)(=)([^"\s']+)/i)         # <foo a=b />
    matches_3 = body.scan(/([a-z0-9]+)=(["'])([^"']*?)\s*$/i)  # <foo a="b />

    (matches_1 + matches_2 + matches_3).each do |match|
      pname = match[0].downcase
      next unless allowed[name].include?(pname)

      value = match[2]
      value = process_param_protocol(value) if protocol_attributes.include?(pname)
      # The attribute is always re-emitted in double quotes, so a double
      # quote inside a single-quoted value must be escaped or it would
      # end the attribute early and let extra attributes through.
      params += %{ #{pname}="#{value.gsub('"', '&quot;')}"}
    end

    ending = ' /' if no_close.include?(name)
    ending = ''   if always_close.include?(name)

    if ending.empty?
      # the tag stays open until its closing tag is seen
      tag_counts[name] = tag_counts[name].to_i + 1
    else
      ending = ' /'
    end

    return '<' + name + params + ending + '>'
  end

  # comments
  if /\A!--(.*)--\z/m =~ data
    return strip_comments ? '' : '<' + data + '>'
  end

  # garbage, ignore it
  ''
end
298
+
299
+ #
300
+ #
301
+ #
302
+
303
# Checks the protocol of an attribute value (such as a src or href).
#
# The value is first run through decode_entities. If the result starts
# with "<protocol>:" and that protocol is not listed in
# allowed_protocols, the protocol and its colon are replaced by '#'.
# Otherwise the decoded value is returned unchanged.
def process_param_protocol(data)
  data = decode_entities(data)

  if matches = /\A([^:]+):/.match(data)
    protocol = matches[1]
    unless allowed_protocols.include?(protocol)
      # keep only what follows "<protocol>:"
      data = '#' + data[(protocol.size + 1)..-1]
    end
  end

  data
end
317
+
318
+ #
319
+ #
320
+ #
321
+
322
+ def process_remove_blanks(data)
323
+ data = data.dup
324
+
325
+ remove_blanks.each do |tag|
326
+ data.gsub!(/<#{tag}(\s[^>]*)?><\/#{tag}>/, '')
327
+ data.gsub!(/<#{tag}(\s[^>]*)?\/>/, '')
328
+ end
329
+
330
+ return data
331
+ end
332
+
333
+ #
334
+ #
335
+ #
336
+
337
+ def fix_case(data)
338
+ data_notags = strip_tags(data)
339
+ data_notags = data_notags.gsub(/[^a-zA-Z]/, '')
340
+
341
+ if data_notags.size < 5
342
+ return data
343
+ end
344
+
345
+ if /[a-z]/ =~ data_notags
346
+ return data
347
+ end
348
+
349
+ data = data.gsub(/(>|^)([^<]+?)(<|$)/s){
350
+ strip_single($1) +
351
+ fix_case_inner(strip_single($2)) +
352
+ strip_single($3)
353
+ }
354
+
355
+ return data
356
+ end
357
+
358
+ #
359
+ #
360
+ #
361
+
362
+ def fix_case_inner(data)
363
+ data = data.dup
364
+
365
+ data.downcase!
366
+
367
+ data.gsub!(/(^|[^\w\s\';,\\-])(\s*)([a-z])/){
368
+ strip_single("#{$1}#{$2}") + strip_single($3).upcase
369
+ }
370
+
371
+ return data
372
+ end
373
+
374
+ #
375
+ #
376
+ #
377
+
378
# Validates the entities of +data+: every "&..." sequence is passed to
# check_entity, which decides what it is replaced with. A second pass
# runs the text found outside of tags through strip_single.
#
# Returns a new string; +data+ is not modified.
def validate_entities(data)
  data = data.dup

  # validate entities throughout the string
  data.gsub!(%r!&([^&;]*)(?=(;|&|$))!) do
    check_entity(strip_single($1), strip_single($2))
  end

  # validate quotes outside of tags
  # (no /s flag: in Ruby it selects the Windows-31J encoding, which makes
  # non-ASCII UTF-8 input raise Encoding::CompatibilityError)
  data.gsub!(/(>|^)([^<]+?)(<|$)/) do
    m1, m2, m3 = $1, $2, $3
    # NOTE(review): '\"' is a backslash followed by a quote, and
    # strip_single has just replaced those sequences, so this
    # substitution looks like a no-op - confirm whether bare quotes were
    # meant to be turned into &quot; here.
    strip_single(m1) +
      strip_single(m2).gsub('\"', '&quot;') +
      strip_single(m3)
  end

  data
end
396
+
397
+ #
398
+ #
399
+ #
400
+
401
+ def check_entity(preamble, term)
402
+ if term != ';'
403
+ return '&amp;' + preamble
404
+ end
405
+
406
+ if is_valid_entity(preamble)
407
+ return '&' + preamble
408
+ end
409
+
410
+ return '&amp;' + preamble
411
+ end
412
+
413
+ #
414
+ #
415
+ #
416
+
417
+ def is_valid_entity(entity)
418
+ re = /^#([0-9]+)$/i
419
+
420
+ if md = re.match(entity)
421
+ if (md[1].to_i > 127)
422
+ return true
423
+ end
424
+ return allow_numbered_entities
425
+ end
426
+
427
+ if allowed_entities.include?(entity)
428
+ return true
429
+ end
430
+
431
+ return nil
432
+ end
433
+
434
+ # within attributes, we want to convert all hex/dec/url
435
+ # escape sequences into their raw characters so that we can
436
+ # check we don't get stray quotes/brackets inside strings.
437
+
438
+ def decode_entities(data)
439
+ data = data.dup
440
+
441
+ data.gsub!(/(&)#(\d+);?/){ decode_dec_entity($1, $2) }
442
+ data.gsub!(/(&)#x([0-9a-f]+);?/i){ decode_hex_entity($1, $2) }
443
+ data.gsub!(/(%)([0-9a-f]{2});?/i){ decode_hex_entity($1, $2) }
444
+
445
+ data = validate_entities(data)
446
+
447
+ return data
448
+ end
449
+
450
+ #
451
+ #
452
+ #
453
+
454
# Decodes one hexadecimal escape. It is called from decode_entities with
# the two captures of its patterns: the escape type ('&' or '%') and the
# hexadecimal digits.
#
# The digits are parsed as base 16 and handed to decode_num_entity
# together with the escape type.
def decode_hex_entity(*m)
  decode_num_entity(m[0], m[1].hex)
end
457
+
458
+ #
459
+ #
460
+ #
461
+
462
# Decodes one decimal entity. It is called from decode_entities with the
# two captures of its pattern: the escape type ('&') and the decimal
# digits, which are handed to decode_num_entity in that order.
def decode_dec_entity(*m)
  decode_num_entity(m[0], m[1])
end
465
+
466
+ #
467
+ #
468
+ #
469
+
470
# Turns a numeric character code back into text.
#
# orig_type - the kind of escape the code came from: '%' or '&'
# d         - the character code (anything responding to to_i)
#
# Codes above 127 are returned re-encoded in their original notation
# ("%c8" or "&#200;"); other codes are converted to the character itself
# and passed through escape_special_chars. Negative codes are treated as
# a space.
def decode_num_entity(orig_type, d)
  d = d.to_i
  d = 32 if d < 0 # space

  # don't mess with high chars
  if d > 127
    return '%' + d.to_s(16) if orig_type == '%'
    # built by concatenation: inside double quotes "&#{d};" would be
    # read as interpolation and lose the '#'
    return '&#' + d.to_s + ';' if orig_type == '&'
  end

  escape_special_chars(d.chr)
end
482
+
483
+ #
484
+ #
485
+ #
486
+
487
+ def strip_single(data)
488
+ return data.gsub('\"', '"').gsub('\0', 0.chr)
489
+ end
490
+
491
+ # Certain characters have special significance in HTML, and
492
+ # should be represented by HTML entities if they are to
493
+ # preserve their meanings. This function returns a string
494
+ # with some of these conversions made; the translations made
495
+ # are those most useful for everyday web programming.
496
+
497
+ def escape_special_chars(data)
498
+ data = data.dup
499
+ data.gsub!( /&/n , '&amp;' )
500
+ data.gsub!( /\"/n , '&quot;' )
501
+ data.gsub!( />/n , '&gt;' )
502
+ data.gsub!( /</n , '&lt;' )
503
+ data.gsub!( /'/ , '&#039;' )
504
+ return data
505
+ end
506
+
507
+ end
508
+
509
+ # Overload the standard String class for extra convenience.
510
+
511
+ class String
512
+ def html_filter(*opts)
513
+ HtmlFilter.new(*opts).filter(self)
514
+ end
515
+ end
516
+
data/meta/package ADDED
@@ -0,0 +1 @@
1
+ htmlfilter
data/meta/project ADDED
@@ -0,0 +1 @@
1
+ rubyworks
data/meta/title ADDED
@@ -0,0 +1 @@
1
+ HTMLFilter
data/meta/version ADDED
@@ -0,0 +1 @@
1
+ 1.0.0
@@ -0,0 +1,35 @@
1
+ require "test/unit"
2
+ require "cssfilter"
3
+ #require 'yaml'
4
+
5
+ class TestCssFilter < Test::Unit::TestCase
6
+
7
+ def setup
8
+ @css = <<-END
9
+ * {
10
+ margin: 0;
11
+ height: 0;
12
+ }
13
+
14
+ body {
15
+ margin: 0;
16
+ height: 0;
17
+ background: url(http://xzy.org);
18
+ }
19
+
20
+ h1 {
21
+ trythis: url(http://here.org/fun.js);
22
+ font-size: 12pt;
23
+ }
24
+ END
25
+ @result = "* {\nmargin: 0;\nheight: 0;\n}\nbody {\nmargin: 0;\nheight: 0;\n}\nh1 {\ntrythis: url(http://here.org/fun.js);\nfont-size: 12pt;\n}"
26
+ end
27
+
28
+ def test_filter
29
+ cssfilter = CssFilter.new(:allowed_hosts=>["here.org"], :strip_whitespace => true)
30
+ csstree = cssfilter.filter(@css)
31
+ assert_equal(@result, csstree.to_s)
32
+ end
33
+
34
+ end
35
+
@@ -0,0 +1,70 @@
1
+ require "test/unit"
2
+ require "htmlfilter"
3
+
4
+ class TestHtmlFilter < Test::Unit::TestCase
5
+
6
+ # core tests
7
+
8
+ def test_multiton_without_options
9
+ h1 = HtmlFilter.new
10
+ h2 = HtmlFilter.new
11
+ h3 = HtmlFilter.new( :strip_comments => false )
12
+ assert_equal( h1.object_id, h2.object_id )
13
+ assert_not_equal( h1.object_id, h3.object_id )
14
+ end
15
+
16
+ def test_multiton_with_options
17
+ h1 = HtmlFilter.new( :strip_comments => false )
18
+ h2 = HtmlFilter.new( :strip_comments => false )
19
+ h3 = HtmlFilter.new
20
+ assert_equal( h1.object_id, h2.object_id )
21
+ assert_not_equal( h1.object_id, h3.object_id )
22
+ end
23
+
24
+ def test_strip_single
25
+ hf = HtmlFilter.new
26
+ assert_equal( '"', hf.send(:strip_single,'\"') )
27
+ assert_equal( "\000", hf.send(:strip_single,'\0') )
28
+ end
29
+
30
+ # functional tests
31
+
32
+ def assert_filter(filtered, original)
33
+ assert_equal(filtered, original.html_filter)
34
+ end
35
+
36
+ def test_fix_quotes
37
+ assert_filter '<img src="foo.jpg" />', "<img src=\"foo.jpg />"
38
+ end
39
+
40
+ def test_basics
41
+ assert_filter '', ''
42
+ assert_filter 'hello', 'hello'
43
+ end
44
+
45
+ def test_balancing_tags
46
+ assert_filter "<b>hello</b>", "<<b>hello</b>"
47
+ assert_filter "<b>hello</b>", "<b>>hello</b>"
48
+ assert_filter "<b>hello</b>", "<b>hello<</b>"
49
+ assert_filter "<b>hello</b>", "<b>hello</b>>"
50
+ assert_filter "", "<>"
51
+ end
52
+
53
+ def test_tag_completion
54
+ assert_filter "hello", "hello<b>"
55
+ assert_filter "<b>hello</b>", "<b>hello"
56
+ assert_filter "hello<b>world</b>", "hello<b>world"
57
+ assert_filter "hello", "hello</b>"
58
+ assert_filter "hello", "hello<b/>"
59
+ assert_filter "hello<b>world</b>", "hello<b/>world"
60
+ assert_filter "<b><b><b>hello</b></b></b>", "<b><b><b>hello"
61
+ assert_filter "", "</b><b>"
62
+ end
63
+
64
+ def test_end_slashes
65
+ assert_filter '<img />', '<img>'
66
+ assert_filter '<img />', '<img/>'
67
+ assert_filter '', '<b/></b>'
68
+ end
69
+
70
+ end
metadata ADDED
@@ -0,0 +1,75 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: htmlfilter
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.0.0
5
+ platform: ruby
6
+ authors: []
7
+
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+
12
+ date: 2009-09-22 00:00:00 -04:00
13
+ default_executable:
14
+ dependencies: []
15
+
16
+ description: HTML Filter library can be used to sanitize and sterilize HTML. A good idea if you let users submit HTML in comments, for instance. This library also include CssFilter. The CssFilter class will clean-up a cascading style sheet. It can be used to remove whitespace and most importantly remove urls.
17
+ email:
18
+ executables: []
19
+
20
+ extensions: []
21
+
22
+ extra_rdoc_files:
23
+ - Rakefile
24
+ - Manifest.txt
25
+ - TODO
26
+ - README.rdoc
27
+ - History.rdoc
28
+ files:
29
+ - lib/cssfilter.rb
30
+ - lib/htmlfilter/multiton.rb
31
+ - lib/htmlfilter.rb
32
+ - meta/package
33
+ - meta/project
34
+ - meta/title
35
+ - meta/version
36
+ - test/test_cssfilter.rb
37
+ - test/test_htmlfilter.rb
38
+ - Rakefile
39
+ - Manifest.txt
40
+ - TODO
41
+ - README.rdoc
42
+ - History.rdoc
43
+ has_rdoc: true
44
+ homepage:
45
+ licenses: []
46
+
47
+ post_install_message:
48
+ rdoc_options:
49
+ - --inline-source
50
+ - --title
51
+ - htmlfilter api
52
+ require_paths:
53
+ - lib
54
+ required_ruby_version: !ruby/object:Gem::Requirement
55
+ requirements:
56
+ - - ">="
57
+ - !ruby/object:Gem::Version
58
+ version: "0"
59
+ version:
60
+ required_rubygems_version: !ruby/object:Gem::Requirement
61
+ requirements:
62
+ - - ">="
63
+ - !ruby/object:Gem::Version
64
+ version: "0"
65
+ version:
66
+ requirements: []
67
+
68
+ rubyforge_project: htmlfilter
69
+ rubygems_version: 1.3.5
70
+ signing_key:
71
+ specification_version: 3
72
+ summary: HTML Filter library can be used to sanitize and sterilize HTML.
73
+ test_files:
74
+ - test/test_cssfilter.rb
75
+ - test/test_htmlfilter.rb