htmlfilter 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
data/History.rdoc ADDED
@@ -0,0 +1,6 @@
1
+ === 1.0.0 / 2009-06-25
2
+
3
+ * 1 major enhancement
4
+
5
+ * Birthday!
6
+
data/Manifest.txt ADDED
@@ -0,0 +1,19 @@
1
+ #!mast bin lib meta test [A-Z]*
2
+ lib
3
+ lib/cssfilter.rb
4
+ lib/htmlfilter
5
+ lib/htmlfilter/multiton.rb
6
+ lib/htmlfilter.rb
7
+ meta
8
+ meta/package
9
+ meta/project
10
+ meta/title
11
+ meta/version
12
+ test
13
+ test/test_cssfilter.rb
14
+ test/test_htmlfilter.rb
15
+ Rakefile
16
+ Manifest.txt
17
+ TODO
18
+ README.rdoc
19
+ History.rdoc
data/README.rdoc ADDED
@@ -0,0 +1,53 @@
1
+ = HtmlFilter
2
+
3
+ * http://rubyworks.github.com/htmlfilter
4
+
5
+ == DESCRIPTION:
6
+
7
+ HTML Filter library can be used to sanitize and sterilize
8
+ HTML. A good idea if you let users submit HTML in comments,
9
+ for instance.
10
+
11
+ This library also includes CssFilter. The CssFilter class will
12
+ clean-up a cascading style sheet. It can be used to remove
13
+ whitespace and most importantly remove urls.
14
+
15
+ == FEATURES:
16
+
17
+ * Sanitize HTML
18
+ * Compress CSS
19
+
20
+ == SYNOPSIS:
21
+
22
+ Via the class.
23
+
24
+ html = "<<b>hello</b>"
25
+
26
+ HtmlFilter.new(options).filter(html)
27
+
28
+ Or using the String extension.
29
+
30
+ html.html_filter #=> "<b>hello</b>"
31
+
32
+ See RDocs for more information.
33
+
34
+ == REQUIREMENTS:
35
+
36
+ * Uses a copy of multiton.rb (included)
37
+
38
+ == INSTALL:
39
+
40
+ * sudo gem install htmlfilter
41
+
42
+ == LICENSE:
43
+
44
+ (Creative Commons Attribution-ShareAlike License)
45
+
46
+ Copyright (c) 2009 Thomas Sawyer
47
+
48
+ See http://creativecommons.org/licenses/by-sa/3.0/deed.en
49
+
50
+ HtmlFilter is a port of lib_filter.php, v1.15 by Cal Henderson <cal@iamcal.com>.
51
+ This code is licensed under a Creative Commons Attribution-ShareAlike 2.5 License.
52
+ See http://creativecommons.org/licenses/by-sa/2.5/.
53
+
data/Rakefile ADDED
@@ -0,0 +1,15 @@
1
+ # -*- ruby -*-
2
+
3
+ #$: << './lib'
4
+ #require 'rubygems'
5
+ #require 'hoe'
6
+ #require 'htmlfilter'
7
+ #Hoe.new('htmlfilter', HtmlFilter::VERSION) do |p|
8
+ # p.rubyforge_name = 'death' # if different than lowercase project name
9
+ # p.developer('Thomas Sawyer', 'transfire@gmail.com')
10
+ #end
11
+
12
+
13
+
14
+ # vim: syntax=Ruby
15
+
data/TODO ADDED
@@ -0,0 +1,7 @@
1
+ = TODO List
2
+
3
+ * Maybe write executable(s) to use library via commandline.
4
+ * Elaborate on Features list in README.txt.
5
+ * Rename class to HTMLFilter (instead of HtmlFilter)
6
+
7
+
data/lib/cssfilter.rb ADDED
@@ -0,0 +1,226 @@
1
+ # = CSS Filter
2
+ #
3
+ # The CssFilter class will clean up a cascading style sheet.
4
+ # It can be used to remove whitespace and most importantly
5
+ # remove urls.
6
+ #
7
+ # == Authors
8
+ #
9
+ # * Trans
10
+ #
11
+ # == Todo
12
+ #
13
+ # * Allow urls to be specified per attribute type.
14
+ #
15
+ # == Copying
16
+ #
17
+ # Copyright (c) 2007 7rans
18
+
19
+ #require 'htmlfilter/uri'
20
+ require 'uri'
21
+
22
+ # = CSS Filter
23
+ #
24
+ # The CssFilter class will clean up a cascading style sheet.
25
+ # It can be used to remove whitespace and most importantly
26
+ # remove urls.
27
+ #
28
class CssFilter
  VERSION="1.0.0"

  # should we remove comments? (true, false)
  attr_accessor :strip_comments

  # should we remove urls? (true, false)
  attr_accessor :strip_urls

  # url schemes which will be allowed (http, ftp, mailto)
  attr_accessor :allowed_scheme

  # alias for allowed_scheme
  alias_method :allowed_protocols, :allowed_scheme
  alias_method :allowed_protocols=, :allowed_scheme=

  # url hosts which will be allowed.
  attr_accessor :allowed_hosts

  # urls which will be allowed. (NOT YET USED)
  attr_accessor :allowed_urls

  # substitute urls: a Hash consulted by #clean_value while rewriting.
  attr_accessor :substitute_urls

  # strip leading and trailing whitespace from every line.
  attr_accessor :strip_whitespace

  # remove blank lines.
  attr_accessor :strip_blanklines

  # Complete parse and rewrite of CSS document.
  # This does a complete "cleaning" but note that
  # it is not yet a perfect parser.
  attr_accessor :rewrite

  # CssFilter option defaults.

  DEFAULT = {
    'strip_comments'   => true,
    'strip_urls'       => true,
    'allowed_urls'     => [],
    'allowed_hosts'    => [],
    'allowed_scheme'   => [],
    'strip_whitespace' => false,
    'strip_blanklines' => true,
    'rewrite'          => false,
    'substitute_urls'  => {}
  }

  # Builds a filter.
  #
  # options - Hash of option name (String or Symbol) => value, merged
  #           over DEFAULT. Each key must name a writable accessor of
  #           this class; an unknown key raises NoMethodError.

  def initialize(options=nil)
    settings = DEFAULT.dup
    options.each { |k, v| settings[k.to_s] = v } if options

    settings.each do |k, v|
      # DEFAULT.dup is shallow: copy containers so that mutating this
      # instance (e.g. through #accept_host) never leaks into DEFAULT.
      v = v.dup if v.is_a?(Array) || v.is_a?(Hash)
      send("#{k}=", v)
    end
  end

  # Adds +host+ to the list of hosts whose urls are kept by #remove_urls.
  # Returns the updated list.

  def accept_host(host)
    allowed_hosts << host
  end

  # Runs every enabled cleaning step over +css+ and returns the result.
  # The argument itself is never modified.

  def filter(css)
    css = remove_comments(css) if strip_comments
    css = remove_urls(css) if strip_urls

    css = remove_nullvalues(css)

    css = remove_whitespace(css) if strip_whitespace
    css = remove_blanklines(css) if strip_blanklines

    css = parse(css).to_css if rewrite
    css
  end

  # Removes /* ... */ comments, including ones that span several lines.

  def remove_comments(data)
    data.gsub(/\/\*.*?\*\//m, '')
  end

  # Removes every url(...) whose host is not in allowed_hosts and whose
  # scheme is not in allowed_scheme. Works on a copy of +data+.
  #
  # TODO: allowed_urls

  def remove_urls(data)
    data = data.dup

    # Optional quotes around the url are dropped before extraction so that
    # they do not end up glued to the extracted uri.
    targets = data.scan(/url\((.*?)\)/).flatten.map do |u|
      u.strip.gsub(/\A["']|["']\z/, '')
    end

    targets.map { |u| URI.extract(u) }.flatten.each do |u|
      data.sub!(u, '') unless allowed_uri?(u)
    end

    # drop the now empty url(), url("") and url('') shells
    data.gsub(/url\(\s*(["']?)\s*\1\s*\)/, '')
  end

  # Strips leading and trailing whitespace from every line. Because \s
  # also matches newlines, whitespace-only lines disappear as well.

  def remove_whitespace(data)
    data.gsub(/^\s*/, '').gsub(/\s*$/, '')
  end

  # Removes lines that contain nothing but whitespace.

  def remove_blanklines(data)
    data.gsub(/^\s*\n/, '')
  end

  # Removes declarations left without a value, such as "background: ;".
  # Hyphenated property names are removed as a whole.

  def remove_nullvalues(data)
    data.gsub(/[\w-]+:\s*;/, '')
  end

  # Breaks a css document up into a hash. This can be used for
  # completely rewriting the css.
  #
  # TODO: Not complete, does not work with "@xxx foo;" for example.

  def parse(css)
    tree = CssTree.new
    css.scan(/^(.*?)\{(.*?)\}/m).each do |ref, props|
      selector = ref.strip
      tree[selector] ||= {}
      clean_properties(props).scan(/(.*?)[:](.*?)([;]|\s*\Z)/).each do |key, val, _term|
        tree[selector][key.strip] = clean_value(val)
      end
    end
    tree
  end

  # Takes a css entry and ensures it is valid (as best it can).
  # It will fix trivial mistakes, and raise an error when it is
  # beyond repair.
  #
  # TODO: So far this does absolutely nothing!

  def clean_properties(atts)
    atts
  end

  # Strips a property value and applies substitute_urls to it.
  #
  # NOTE(review): substitute_urls is assumed to map an original url to
  # its replacement; the previous code referenced an undefined +urls+
  # and raised NameError for every property. Confirm the intended shape.

  def clean_value(val)
    val = val.strip
    return val if substitute_urls.nil? || substitute_urls.empty?

    val.gsub(/url\((.*?)\)/) do |whole|
      target = $1.strip
      substitute_urls.key?(target) ? "url(#{substitute_urls[target]})" : whole
    end
  end

  private

  # True when +u+ parses as a URI whose host or scheme is whitelisted.
  # Text that does not parse as a URI is never allowed.

  def allowed_uri?(u)
    uri = URI.parse(u)
    allowed_hosts.include?(uri.host) || allowed_scheme.include?(uri.scheme)
  rescue URI::InvalidURIError
    false
  end

end
200
+
201
+
202
+ # CSS parse tree used for "deep filtering": a Hash of selector => { property => value }, as filled in by CssFilter#parse.
203
+
204
+ class CssTree < Hash
205
+
206
# options - optional settings; stored in @options and not read by anything in this class.
+ def initialize(options=nil)
207
+ @options = options || {}
208
# explicit empty argument list: Hash#initialize must not receive +options+ as its default value
+ super()
209
+ end
210
+
211
+ # Re-output the CSS, all tidy ;) One "selector{key:value;...}" line per entry, in insertion order.
212
+
213
+ def to_css
214
+ css = ""
215
+ each do |selector, entries|
216
+ css << "#{selector}{"
217
+ entries.each do |key, value|
218
+ css << "#{key}:#{value};"
219
+ end
220
+ css << "}\n"
221
+ end
222
+ return css
223
+ end
224
+
225
+ end
226
+
@@ -0,0 +1,386 @@
1
+ # = Multiton
2
+ #
3
+ # == Synopsis
4
+ #
5
+ # Multiton design pattern ensures only one object is allocated for a given state.
6
+ #
7
+ # The 'multiton' pattern is similar to a singleton, but instead of only one
8
+ # instance, there are several similar instances. It is useful when you want to
9
+ # avoid constructing objects many times because of some huge expense (connecting
10
+ # to a database for example), require a set of similar but not identical
11
+ # objects, and cannot easily control how many times a constructor may be called.
12
+ #
13
+ # class SomeMultitonClass
14
+ # include Multiton
15
+ # attr :arg
16
+ # def initialize(arg)
17
+ # @arg = arg
18
+ # end
19
+ # end
20
+ #
21
+ # a = SomeMultitonClass.new(4)
22
+ # b = SomeMultitonClass.new(4) # a and b are same object
23
+ # c = SomeMultitonClass.new(2) # c is a different object
24
+ #
25
+ # == Previous Behavior
26
+ #
27
+ # In previous versions of Multiton the #new method was made
28
+ # private and #instance had to be used in its stead --just like Singleton.
29
+ # But this is less desirable for Multiton since Multitons can
30
+ # have multiple instances, not just one.
31
+ #
32
+ # So instead Multiton now defines #create as a private alias of
33
+ # the original #new method (just in case it is needed) and then
34
+ # defines #new to handle the multiton; #instance is provided
35
+ # as an alias for it.
36
+ #
37
+ #--
38
+ # So if you must have the old behavior, all you need do is re-alias
39
+ # #new to #create and privatize it.
40
+ #
41
+ # class SomeMultitonClass
42
+ # include Multiton
43
+ # alias_method :new, :create
44
+ # private :new
45
+ # ...
46
+ # end
47
+ #
48
+ # Then only #instance will be available for creating the Multiton.
49
+ #++
50
+ #
51
+ # == How It Works
52
+ #
53
+ # A pool of objects is searched for a previously cached object,
54
+ # if one is not found we construct one and cache it in the pool
55
+ # based on class and the args given to the constructor.
56
+ #
57
+ # A limitation of this approach is that it is impossible to
58
+ # detect if different blocks were given to a constructor (if it takes a
59
+ # block). So it is the constructor arguments _only_ which determine
60
+ # the uniqueness of an object. To workaround this, define the _class_
61
+ # method ::multiton_id.
62
+ #
63
+ # def Klass.multiton_id(*args, &block)
64
+ # # ...
65
+ # end
66
+ #
67
+ # Which should return a hash key used to identify the object being
68
+ # constructed as (not) unique.
69
+ #
70
+ # == Authors
71
+ #
72
+ # * Christoph Rippel
73
+ # * Thomas Sawyer
74
+ #
75
+ # = Copying
76
+ #
77
+ # Copyright (c) 2007 Christoph Rippel, Thomas Sawyer
78
+ #
79
+ # Ruby License
80
+ #
81
+ # This module is free software. You may use, modify, and/or redistribute this
82
+ # software under the same terms as Ruby.
83
+ #
84
+ # This program is distributed in the hope that it will be useful, but WITHOUT
85
+ # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
86
+ # FOR A PARTICULAR PURPOSE.
87
+
88
+ require 'thread'
89
+
90
+ # = Multiton
91
+ #
92
+ # Multiton design pattern ensures only one object is allocated for a given state.
93
+ #
94
+ # The 'multiton' pattern is similar to a singleton, but instead of only one
95
+ # instance, there are several similar instances. It is useful when you want to
96
+ # avoid constructing objects many times because of some huge expense (connecting
97
+ # to a database for example), require a set of similar but not identical
98
+ # objects, and cannot easily control how many times a contructor may be called.
99
+ #
100
+ # class SomeMultitonClass
101
+ # include Multiton
102
+ # attr :arg
103
+ # def initialize(arg)
104
+ # @arg = arg
105
+ # end
106
+ # end
107
+ #
108
+ # a = SomeMultitonClass.new(4)
109
+ # b = SomeMultitonClass.new(4) # a and b are same object
110
+ # c = SomeMultitonClass.new(2) # c is a different object
111
+ #
112
+ # == How It Works
113
+ #
114
+ # A pool of objects is searched for a previously cached object,
115
+ # if one is not found we construct one and cache it in the pool
116
+ # based on class and the args given to the contructor.
117
+ #
118
+ # A limitation of this approach is that it is impossible to
119
+ # detect if different blocks were given to a contructor (if it takes a
120
+ # block). So it is the constructor arguments _only_ which determine
121
+ # the uniqueness of an object. To workaround this, define the _class_
122
+ # method ::multiton_id.
123
+ #
124
+ # def Klass.multiton_id(*args, &block)
125
+ # # ...
126
+ # end
127
+ #
128
+ # Which should return a hash key used to identify the object being
129
+ # constructed as (not) unique.
130
+
131
module Multiton

  # A multiton is unique per constructor arguments, so the built-in
  # copying methods are disabled.

  def clone
    raise TypeError, "can't clone Multiton #{self}"
  end

  def dup
    raise TypeError, "can't dup Multiton #{self}"
  end

  # default marshalling strategy

  protected

  # Serializes only the constructor arguments; MetaMethods#_load turns
  # them back into the (shared) instance.
  def _dump(depth=-1)
    Marshal.dump(@multiton_initializer)
  end

  # Mutex to safely store multiton instances. Maps a multiton id to the
  # mutex guarding its construction; a missing id lazily gets a fresh
  # Mutex through the overridden #default.

  class InstanceMutex < Hash #:nodoc:
    def initialize
      super()
      @global = Mutex.new
    end

    # Once an instance exists, locking is no longer needed: swap in the
    # no-op DummyMutex.
    def initialized(arg)
      store(arg, DummyMutex)
    end

    def (DummyMutex = Object.new).synchronize
      yield
    end

    # Called by Hash#[] for missing keys.
    def default(arg)
      @global.synchronize{ fetch(arg){ store(arg, Mutex.new) } }
    end
  end

  # Multiton can be included in another module, in which case that module
  # effectively becomes a multiton behavior distributor too. This is why
  # #included is propagated to the base module by putting it in a module
  # of its own.

  module Inclusive
    private

    def included(base)
      class << base
        # gracefully handle multiple inclusions of Multiton
        unless include?(Multiton::MetaMethods)
          alias_method :new!, :new
          private :allocate
          include Multiton::MetaMethods

          if method_defined?(:marshal_dump)
            undef_method :marshal_dump
            warn "warning: marshal_dump was undefined since it is incompatible with the Multiton pattern"
          end
        end
      end
    end
  end

  extend Inclusive

  # Class-level methods mixed into every class that includes Multiton.

  module MetaMethods

    include Inclusive

    # Returns the cached instance for the given arguments, constructing
    # it with the original constructor (new!) on first use. The second
    # fetch re-checks the cache once the per-id mutex is held.
    def instance(*e, &b)
      arg = multiton_id(*e, &b)
      multiton_instance.fetch(arg) do
        multiton_mutex[arg].synchronize do
          multiton_instance.fetch(arg) do
            val = multiton_instance[arg] = new!(*e, &b)
            val.instance_variable_set(:@multiton_initializer, e, &b)
            multiton_mutex.initialized(arg)
            val
          end
        end
      end
    end
    alias_method :new, :instance

    # True when an instance for these arguments has already been built.
    def initialized?(*e, &b)
      multiton_instance.key?(multiton_id(*e, &b))
    end

    protected

    def multiton_instance
      @multiton_instance ||= Hash.new
    end

    def multiton_mutex
      @multiton_mutex ||= InstanceMutex.new
    end

    # Forgets every cached instance.
    def reinitialize
      multiton_instance.clear
      multiton_mutex.clear
    end

    # NOTE(review): Marshal.load must only ever see trusted data.
    def _load(str)
      instance(*Marshal.load(str))
    end

    private

    # Default method to create a key to cache already constructed
    # instances. In the use case MultitonClass.new(e), MultiClass.new(f)
    # must be semantically equal if multiton_id(e).eql?(multiton_id(f))
    # evaluates to true.
    def multiton_id(*e, &b)
      e
    end

    # Rejects a class-level marshal_dump. The hook receives a Symbol, so
    # the name is compared as a Symbol; the former check against the
    # String 'marshal_dump' could never succeed.
    def singleton_method_added(sym)
      super
      if sym == :marshal_dump
        raise TypeError, "Don't use marshal_dump - rely on _dump and _load instead"
      end
    end

  end

end
273
+
274
+
275
+
276
+
277
+ =begin
278
+ # TODO Convert this into a real test and/or benchmark.
279
+
280
+ if $0 == __FILE__
281
+
282
+ ### Simple marshalling test #######
283
+ class A
284
+ def initialize(a,*e)
285
+ @e = a
286
+ end
287
+
288
+ include Multiton
289
+ begin
290
+ def self.marshal_dump(depth = -1)
291
+ end
292
+ rescue => mes
293
+ p mes
294
+ class << self; undef marshal_dump end
295
+ end
296
+ end
297
+
298
+ C = Class.new(A.clone)
299
+ s = C.instance('a','b')
300
+
301
+ raise unless Marshal.load(Marshal.dump(s)) == s
302
+
303
+
304
+ ### Interdependent initialization example and threading benchmark ###
305
+
306
+ class Regular_SymPlane
307
+ def self.multiton_id(*e)
308
+ a,b = e
309
+ (a+b - 1)*(a+b )/2 + (a > b ? a : b)
310
+ end
311
+
312
+ def initialize(a,b)
313
+ klass = self.class
314
+ if a < b
315
+ @l = b > 0 ? klass.instance(a,b-1) : nil
316
+ @r = a > 0 ? klass.instance(a-1,b) : nil
317
+ else
318
+ @l = a > 0 ? klass.instance(a-1,b) : nil
319
+ @r = b > 0 ? klass.instance(a,b-1) : nil
320
+ end
321
+ end
322
+
323
+ include Multiton
324
+ end
325
+
326
+
327
+
328
+ def nap
329
+ # Thread.pass
330
+ sleep(rand(0.01))
331
+ end
332
+
333
+ class SymPlane < Regular_SymPlane
334
+ @m = Mutex.new
335
+ @count = 0
336
+ end
337
+
338
+ class << SymPlane
339
+ attr_reader :count
340
+ def reinitialize
341
+ super
342
+ @m = Mutex.new
343
+ @count = 0
344
+ end
345
+ def inherited(sub_class)
346
+ super
347
+ sub_class.instance_eval { @m = Mutex.new; @count = 0 }
348
+ end
349
+
350
+ def multiton_id(*e)
351
+ nap()
352
+ super
353
+ end
354
+
355
+ def new!(*e) # NOTICE!!!
356
+ super
357
+ ensure
358
+ nap()
359
+ @m.synchronize { p @count if (@count += 1) % 15 == 0 }
360
+ end
361
+
362
+ def run(k)
363
+ threads = 0
364
+ max = k * (k+1) / 2
365
+ puts ""
366
+ while count() < max
367
+ Thread.new { threads+= 1; instance(rand(30),rand(30)) }
368
+ end
369
+ puts "\nThe simulation created #{threads} threads"
370
+ end
371
+ end
372
+
373
+
374
+ require 'benchmark'
375
+ include Benchmark
376
+
377
+ bmbm do |x|
378
+ x.report('Initialize 465 SymPlane instances') { SymPlane.run(30) }
379
+ x.report('Reinitialize ') do
380
+ sleep 3
381
+ SymPlane.reinitialize
382
+ end
383
+ end
384
+
385
+ end
386
+ =end
data/lib/htmlfilter.rb ADDED
@@ -0,0 +1,516 @@
1
+ # = HTML Filter
2
+ #
3
+ # HTML Filter library can be used to sanitize and sterilize
4
+ # HTML. A good idea if you let users submit HTML in comments,
5
+ # for instance.
6
+ #
7
+ # HtmlFilter is a port of lib_filter.php, v1.15 by Cal Henderson <cal@iamcal.com>
8
+ #
9
+ # This code is licensed under a Creative Commons Attribution-ShareAlike 2.5 License
10
+ # http://creativecommons.org/licenses/by-sa/2.5/
11
+ #
12
+ # Thanks to Jang Kim for adding support for single quoted attributes.
13
+ #
14
+ # == Reference
15
+ #
16
+ # * http://iamcal.com/publish/articles/php/processing_html/
17
+ # * http://iamcal.com/publish/articles/php/processing_html_part_2/
18
+ #
19
+ # == Author(s)
20
+ #
21
+ # * Trans
22
+ # * George Moschovitis
23
+ # * James Britt
24
+ # * Cal Henderson
25
+ # * Jang Kim
26
+ #
27
+ # == Copying
28
+ #
29
+ # Copyright (c) 2007 Trans
30
+
31
+ require 'htmlfilter/multiton.rb'
32
+
33
+ # = HtmlFilter
34
+ #
35
+ # HTML Filter library can be used to sanitize and sterilize
36
+ # HTML. A good idea if you let users submit HTML in comments,
37
+ # for instance.
38
+ #
39
+ # lib_filter.php, v1.15 by Cal Henderson <cal@iamcal.com>
40
+ #
41
+ # This code is licensed under a Creative Commons Attribution-ShareAlike 2.5 License
42
+ # http://creativecommons.org/licenses/by-sa/2.5/
43
+ #
44
+ # Thanks to Jang Kim for adding support for single quoted attributes.
45
+ #
46
+ # == Reference
47
+ #
48
+ # * http://iamcal.com/publish/articles/php/processing_html/
49
+ # * http://iamcal.com/publish/articles/php/processing_html_part_2/
50
+
51
+ class HtmlFilter
52
+ VERSION = "1.0.0"
53
+
54
+ include Multiton
55
+
56
+ # tags and attributes that are allowed
57
+ #
58
+ # Eg.
59
+ #
60
+ # {
61
+ # 'a' => ['href', 'target'],
62
+ # 'b' => [],
63
+ # 'img' => ['src', 'width', 'height', 'alt']
64
+ # }
65
+ attr_accessor :allowed
66
+
67
+ # tags which should always be self-closing (e.g. "<img />")
68
+ attr_accessor :no_close
69
+
70
+ # tags which must always have separate opening and closing
71
+ # tags (e.g. "<b></b>")
72
+ attr_accessor :always_close
73
+
74
+ # attributes which should be checked for valid protocols
75
+ # (src,href)
76
+ attr_accessor :protocol_attributes
77
+
78
+ # protocols which are allowed (http, ftp, mailto)
79
+ attr_accessor :allowed_protocols
80
+
81
+ # tags which should be removed if they contain no content
82
+ # (e.g. "<b></b>" or "<b />")
83
+ attr_accessor :remove_blanks
84
+
85
+ # should we remove comments? (true, false)
86
+ attr_accessor :strip_comments
87
+
88
+ # should we try and make a b tag out of "b>" (true, false)
89
+ attr_accessor :always_make_tags
90
+
91
+ # entity control option (true, false)
92
+ attr_accessor :allow_numbered_entities
93
+
94
+ # entity control option (amp, gt, lt, quot, etc.)
95
+ attr_accessor :allowed_entities
96
+
97
+ # default settings
98
+
99
+ DEFAULT = {
100
+ 'allowed' => {
101
+ 'a' => ['href', 'target'],
102
+ 'b' => [],
103
+ 'i' => [],
104
+ 'img' => ['src', 'width', 'height', 'alt']
105
+ },
106
+ 'no_close' => ['img', 'br', 'hr'],
107
+ 'always_close' => ['a', 'b'],
108
+ 'protocol_attributes' => ['src', 'href'],
109
+ 'allowed_protocols' => ['http', 'ftp', 'mailto'],
110
+ 'remove_blanks' => ['a', 'b'],
111
+ 'strip_comments' => true,
112
+ 'always_make_tags' => true,
113
+ 'allow_numbered_entities' => true,
114
+ 'allowed_entities' => ['amp', 'gt', 'lt', 'quot']
115
+ }
116
+
117
+ # New html filter.
118
+
119
# Builds a filter from DEFAULT, overridden by +options+.
# options - optional Hash; keys are converted with to_s, so Symbols and Strings both work.
# Every resulting key is assigned through its "<key>=" writer.
+ def initialize( options=nil )
120
+ if options
121
# shallow copy: nested arrays/hashes are still the objects held by DEFAULT
+ h = DEFAULT.dup
122
+ options.each do |k,v|
123
+ h[k.to_s] = v
124
+ end
125
+ options = h
126
+ else
127
+ options = DEFAULT.dup
128
+ end
129
+
130
+ options.each{ |k,v| send("#{k}=",v) }
131
+ end
132
+
133
+ # Filter html string.
134
+
135
# Sanitizes +data+ and returns the result.
# Resets the tag counter, then runs the five passes below in this fixed order.
+ def filter(data)
136
# fresh counter per call; process_tag and check_tags read it through tag_counts
+ @tag_counts = {}
137
+
138
+ data = escape_comments(data)
139
+ data = balance_html(data)
140
+ data = check_tags(data)
141
+ data = process_remove_blanks(data)
142
+ data = validate_entities(data)
143
+
144
+ return data
145
+ end
146
+
147
+ private
148
+
149
+ #
150
+ # internal tag counter
151
+ #
152
+
153
# Reader for @tag_counts, the Hash that #filter resets on every call (nil before the first call).
+ def tag_counts ; @tag_counts; end
154
+
155
+ #
156
+ #
157
+ #
158
+
159
# Escapes the special characters inside every HTML comment so that the
# comment body cannot be mistaken for markup by the later passes.
#
# data - the HTML string.
#
# Returns a new string. The /m option lets "." match newlines, so
# comments spanning several lines are handled too. (The "s" option used
# before is PHP's dot-all flag; in Ruby it selects the Windows-31J
# encoding instead.)
def escape_comments(data)
  data.gsub(/<!--(.*?)-->/m) do
    '<!--' + escape_special_chars(strip_single($1)) + '-->'
  end
end
166
+
167
+ #
168
+ #
169
+ #
170
+
171
# Repairs or escapes stray angle brackets so that every "<" is paired
# with a ">" before the tags are inspected.
#
# data - the HTML string (not modified).
#
# With always_make_tags the stray bracket is completed into a tag
# ("b>" becomes "<b>"); otherwise it is escaped as an entity.
#
# The patterns are anchored with \A and \z: "^" and "$" match at every
# line break in Ruby, which split tags written across several lines.
def balance_html(data)
  data = data.dup

  if always_make_tags
    # try and form html
    data.gsub!(/>>+/, '>')
    data.gsub!(/<<+/, '<')
    data.sub!(/\A>/, '')
    data.gsub!(/<([^>]*?)(?=<|\z)/, '<\1>')
    data.gsub!(/(\A|>)([^<]*?)(?=>)/, '\1<\2')
  else
    # escape stray brackets
    data.gsub!(/<([^>]*?)(?=<|\z)/, '&lt;\1')
    data.gsub!(/(\A|>)([^<]*?)(?=>)/, '\1\2&gt;<')
    # the last regexp causes '<>' entities to appear
    # (we need to do a lookahead assertion so that the last bracket
    # can be used in the next pass of the regexp)
    data.gsub!('<>', '')
  end

  data
end
193
+
194
+ #
195
+ #
196
+ #
197
+
198
# Runs every tag through process_tag and appends a closing tag for each
# tag that is still open afterwards.
#
# data - the bracket-balanced HTML string (not modified).
#
# Returns a new string. The /m option makes "." match newlines, so tags
# spanning several lines are sanitized as well instead of being passed
# through untouched.
def check_tags(data)
  data = data.gsub(/<(.*?)>/m) { process_tag(strip_single($1)) }

  tag_counts.each do |tag, cnt|
    cnt.times { data << "</#{tag}>" }
  end

  data
end
211
+
212
+ #
213
+ #
214
+ #
215
+
216
# Rebuilds one tag from the text found between "<" and ">".
#
# data - the tag interior, e.g. "/b", "img src='x' /" or "!-- note --".
#
# Returns the sanitized tag, or '' when the tag is not allowed or cannot
# be parsed. Opening and closing tags update tag_counts.
def process_tag(data)
  # ending tags
  if (matches = /\A\/([a-z0-9]+)/i.match(data))
    return process_end_tag(matches[1].downcase)
  end

  # starting tags (/m: attributes may sit on following lines)
  if (matches = /\A([a-z0-9]+)(.*?)(\/?)\z/mi.match(data))
    return process_start_tag(matches[1].downcase, matches[2], matches[3])
  end

  # comments
  if /\A!--(.*)--\z/m =~ data
    return strip_comments ? '' : '<' + data + '>'
  end

  # garbage, ignore it
  ''
end

# Emits "</name>" only while a matching opening tag is still counted.
def process_end_tag(name)
  return '' unless allowed.key?(name)
  return '' if no_close.include?(name)
  # 0 is truthy in Ruby: compare explicitly, otherwise surplus closing
  # tags are emitted and the counter goes negative.
  return '' unless tag_counts[name].to_i > 0

  tag_counts[name] -= 1
  "</#{name}>"
end

# Rebuilds an opening (or self-closing) tag with its allowed attributes.
def process_start_tag(name, body, ending)
  return '' unless allowed.key?(name)

  params = tag_params(name, body)

  ending = ' /' if no_close.include?(name)
  ending = ''   if always_close.include?(name)

  if ending.empty?
    tag_counts[name] = tag_counts[name].to_i + 1
  else
    ending = ' /'
  end

  '<' + name + params + ending + '>'
end

# Collects the allowed attributes of +body+ as a ' name="value"' string.
def tag_params(name, body)
  quoted   = body.scan(/([a-z0-9]+)=(["'])(.*?)\2/mi)             # <foo a="b" />
  unquoted = body.scan(/([a-z0-9]+)(=)([^"\s']+)/mi)              # <foo a=b />
  unclosed = body.scan(/([a-z0-9]+)=(["'])([^"']*?)\s*\z/mi)      # <foo a="b />

  attrs = []
  (unquoted + quoted + unclosed).each do |match|
    pname = match[0].downcase
    next unless allowed[name].include?(pname)

    value = match[2]
    value = process_param_protocol(value) if protocol_attributes.include?(pname)

    # The value is always re-emitted inside double quotes, so a double
    # quote taken from a single-quoted attribute must be escaped or it
    # would end the attribute and inject new ones.
    escaped = value.gsub('"', '&quot;')
    attrs << %{ #{pname}="#{escaped}"}
  end
  attrs.join
end
298
+
299
+ #
300
+ #
301
+ #
302
+
303
# Checks the protocol of a url-carrying attribute value (src, href).
#
# data - the raw attribute value.
#
# Returns the entity-decoded value. When it starts with a protocol that
# is not in allowed_protocols, the protocol is cut off and the rest is
# returned behind a "#". (The code used to keep the protocol and cut off
# the rest instead.) Protocols are compared case-insensitively.
def process_param_protocol(data)
  data = decode_entities(data)

  if (matches = /\A([^:]+):/.match(data))
    protocol = matches[1]
    unless allowed_protocols.include?(protocol.downcase)
      # everything after "<protocol>:"
      data = '#' + data[(protocol.size + 1)..-1]
    end
  end

  data
end
317
+
318
+ #
319
+ #
320
+ #
321
+
322
# Removes empty elements for every tag listed in remove_blanks.
# Works on a copy of +data+ and returns it.
+ def process_remove_blanks(data)
323
+ data = data.dup
324
+
325
+ remove_blanks.each do |tag|
326
# open tag (with optional attributes) immediately followed by its closing tag
+ data.gsub!(/<#{tag}(\s[^>]*)?><\/#{tag}>/, '')
327
# self-closing form
+ data.gsub!(/<#{tag}(\s[^>]*)?\/>/, '')
328
+ end
329
+
330
+ return data
331
+ end
332
+
333
+ #
334
+ #
335
+ #
336
+
337
# Converts text that is written entirely in capitals to sentence case.
#
# data - the HTML string.
#
# Returns +data+ unchanged when, ignoring tags, it holds fewer than five
# letters or any lower-case letter. Otherwise every text segment outside
# of tags is passed through fix_case_inner.
def fix_case(data)
  # Ruby has no strip_tags (a PHP builtin the port still called), so the
  # tags are dropped with a pattern before counting letters.
  letters = data.gsub(/<[^>]*>/, '').gsub(/[^a-zA-Z]/, '')

  return data if letters.size < 5
  return data if letters =~ /[a-z]/

  data.gsub(/(>|\A)([^<]+?)(<|\z)/) do
    head, text, tail = $1, $2, $3
    strip_single(head) + fix_case_inner(strip_single(text)) + strip_single(tail)
  end
end
357
+
358
+ #
359
+ #
360
+ #
361
+
362
# Lower-cases a copy of +data+, then upper-cases each letter selected by the pattern below.
+ def fix_case_inner(data)
363
+ data = data.dup
364
+
365
+ data.downcase!
366
+
367
# a letter at the start of a line, or after a character outside the listed class, optionally separated by whitespace
+ data.gsub!(/(^|[^\w\s\';,\\-])(\s*)([a-z])/){
368
+ strip_single("#{$1}#{$2}") + strip_single($3).upcase
369
+ }
370
+
371
+ return data
372
+ end
373
+
374
+ #
375
+ #
376
+ #
377
+
378
# Validates entities and escapes double quotes found outside of tags.
#
# data - the HTML string, or an attribute value (not modified).
#
# Returns a new string in which every "&" that does not start an
# accepted entity is written as "&amp;" and every double quote in the
# text between tags is written as "&quot;".
def validate_entities(data)
  # validate entities throughout the string
  data = data.gsub(/&([^&;]*)(?=(;|&|\z))/) do
    check_entity(strip_single($1), strip_single($2))
  end

  # validate quotes outside of tags. Anchored with \A and \z so that a
  # line break inside a tag does not start a new "text" segment.
  data.gsub(/(>|\A)([^<]+?)(<|\z)/) do
    head, text, tail = $1, $2, $3
    # replace the plain double quote; the former pattern looked for a
    # backslash followed by a quote and therefore never matched
    strip_single(head) + strip_single(text).gsub('"', '&quot;') + strip_single(tail)
  end
end
396
+
397
+ #
398
+ #
399
+ #
400
+
401
# Rebuilds the start of one entity candidate found by validate_entities.
# preamble - the text after "&"; term - the character that followed it (";" for a terminated entity).
# Returns "&" + preamble for an accepted entity and "&amp;" + preamble otherwise; the terminator is not included.
+ def check_entity(preamble, term)
402
# not terminated by ";": treat the "&" as plain text
+ if term != ';'
403
+ return '&amp;' + preamble
404
+ end
405
+
406
+ if is_valid_entity(preamble)
407
+ return '&' + preamble
408
+ end
409
+
410
+ return '&amp;' + preamble
411
+ end
412
+
413
+ #
414
+ #
415
+ #
416
+
417
# Decides whether +entity+ (the text between "&" and ";") may stay.
#
# Numbered entities ("#123") above 127 are always accepted; the others
# follow allow_numbered_entities. Named entities must be listed in
# allowed_entities.
#
# Returns true (or the allow_numbered_entities setting) when accepted,
# nil otherwise. The whole string is matched (\A ... \z): "^" and "$"
# would also accept a number sitting on a later line of +entity+.
def is_valid_entity(entity)
  if (md = /\A#([0-9]+)\z/.match(entity))
    return true if md[1].to_i > 127
    return allow_numbered_entities
  end

  return true if allowed_entities.include?(entity)

  nil
end
433
+
434
+ # within attributes, we want to convert all hex/dec/url
435
+ # escape sequences into their raw characters so that we can
436
+ # check we don't get stray quotes/brackets inside strings.
437
+
438
# Decodes decimal entities, hex entities and %xx escapes in a copy of +data+, then re-validates the result.
# Each substitution passes the prefix character ("&" or "%") and the digits to the decode_* helper.
+ def decode_entities(data)
439
+ data = data.dup
440
+
441
# decimal entity, trailing ";" optional
+ data.gsub!(/(&)#(\d+);?/){ decode_dec_entity($1, $2) }
442
# hexadecimal entity
+ data.gsub!(/(&)#x([0-9a-f]+);?/i){ decode_hex_entity($1, $2) }
443
# url escape: "%" followed by two hex digits
+ data.gsub!(/(%)([0-9a-f]{2});?/i){ decode_hex_entity($1, $2) }
444
+
445
+ data = validate_entities(data)
446
+
447
+ return data
448
+ end
449
+
450
+ #
451
+ #
452
+ #
453
+
454
# Decodes one hexadecimal entity or url escape.
#
# m - the prefix character ("&" or "%") followed by the hex digits; the
#     last two arguments are used, so a leading full-match argument is
#     tolerated.
#
# Returns the result of decode_num_entity for the parsed code point.
# (The indexes used before pointed past the two arguments actually
# passed, and the digits were formatted as hex instead of parsed.)
def decode_hex_entity(*m)
  orig_type, digits = m.last(2)
  decode_num_entity(orig_type, digits.to_i(16))
end
457
+
458
+ #
459
+ #
460
+ #
461
+
462
# Decodes one decimal entity.
#
# m - the prefix character ("&") followed by the decimal digits; the
#     last two arguments are used, so a leading full-match argument is
#     tolerated.
#
# Returns the result of decode_num_entity. (The indexes used before
# pointed past the two arguments actually passed, so the digits were
# always nil.)
def decode_dec_entity(*m)
  orig_type, digits = m.last(2)
  decode_num_entity(orig_type, digits)
end
465
+
466
+ #
467
+ #
468
+ #
469
+
470
# Turns a numeric character reference into text.
#
# orig_type - "&" for an entity, "%" for a url escape.
# d         - the code point (Integer or numeric String).
#
# Code points above 127 are written back in their original notation;
# everything else becomes the character itself, escaped for HTML.
def decode_num_entity(orig_type, d)
  d = d.to_i
  d = 32 if d < 0 # space

  # don't mess with high chars
  if d > 127
    return '%' + d.to_s(16) if orig_type == '%'
    # "&##{d};" - the first "#" is literal, the second starts the
    # interpolation. "&#{d};" silently dropped the "#".
    return "&##{d};" if orig_type == '&'
  end

  escape_special_chars(d.chr)
end
482
+
483
+ #
484
+ #
485
+ #
486
+
487
# Returns a copy of +data+ in which the two-character sequence backslash + double quote becomes a double quote
# and the two-character sequence backslash + "0" becomes a NUL character.
# NOTE(review): presumably undoes the quoting that the PHP original's evaluated replacements added - confirm.
+ def strip_single(data)
488
+ return data.gsub('\"', '"').gsub('\0', 0.chr)
489
+ end
490
+
491
+ # Certain characters have special significance in HTML, and
492
+ # should be represented by HTML entities if they are to
493
+ # preserve their meanings. This function returns a string
494
+ # with some of these conversions made; the translations made
495
+ # are those most useful for everyday web programming.
496
+
497
# Returns a copy of +data+ with &, ", >, < and ' replaced by their entities.
# "&" is replaced first so the ampersands of the entities added afterwards are not escaped again.
+ def escape_special_chars(data)
498
+ data = data.dup
499
+ data.gsub!( /&/n , '&amp;' )
500
+ data.gsub!( /\"/n , '&quot;' )
501
+ data.gsub!( />/n , '&gt;' )
502
+ data.gsub!( /</n , '&lt;' )
503
+ data.gsub!( /'/ , '&#039;' )
504
+ return data
505
+ end
506
+
507
+ end
508
+
509
+ # Overload the standard String class for extra convenience.
510
+
511
+ class String
512
# Filters this string with HtmlFilter; +opts+ is passed straight to HtmlFilter.new.
+ def html_filter(*opts)
513
+ HtmlFilter.new(*opts).filter(self)
514
+ end
515
+ end
516
+
data/meta/package ADDED
@@ -0,0 +1 @@
1
+ htmlfilter
data/meta/project ADDED
@@ -0,0 +1 @@
1
+ rubyworks
data/meta/title ADDED
@@ -0,0 +1 @@
1
+ HTMLFilter
data/meta/version ADDED
@@ -0,0 +1 @@
1
+ 1.0.0
@@ -0,0 +1,35 @@
1
+ require "test/unit"
2
+ require "cssfilter"
3
+ #require 'yaml'
4
+
5
# End-to-end test of CssFilter#filter on a small three-rule style sheet.
+ class TestCssFilter < Test::Unit::TestCase
6
+
7
# @css is the input; @result is the expected output: lines stripped, blank lines gone,
# the xzy.org background declaration removed and the here.org url kept.
+ def setup
8
+ @css = <<-END
9
+ * {
10
+ margin: 0;
11
+ height: 0;
12
+ }
13
+
14
+ body {
15
+ margin: 0;
16
+ height: 0;
17
+ background: url(http://xzy.org);
18
+ }
19
+
20
+ h1 {
21
+ trythis: url(http://here.org/fun.js);
22
+ font-size: 12pt;
23
+ }
24
+ END
25
+ @result = "* {\nmargin: 0;\nheight: 0;\n}\nbody {\nmargin: 0;\nheight: 0;\n}\nh1 {\ntrythis: url(http://here.org/fun.js);\nfont-size: 12pt;\n}"
26
+ end
27
+
28
# here.org is whitelisted, so only the xzy.org url may be removed.
+ def test_filter
29
+ cssfilter = CssFilter.new(:allowed_hosts=>["here.org"], :strip_whitespace => true)
30
+ csstree = cssfilter.filter(@css)
31
+ assert_equal(@result, csstree.to_s)
32
+ end
33
+
34
+ end
35
+
@@ -0,0 +1,70 @@
1
+ require "test/unit"
2
+ require "htmlfilter"
3
+
4
# Tests for HtmlFilter: multiton identity, the strip_single helper and String#html_filter with default options.
+ class TestHtmlFilter < Test::Unit::TestCase
5
+
6
+ # core tests: instances are shared per option set
7
+
8
+ def test_multiton_without_options
9
+ h1 = HtmlFilter.new
10
+ h2 = HtmlFilter.new
11
+ h3 = HtmlFilter.new( :strip_comments => false )
12
+ assert_equal( h1.object_id, h2.object_id )
13
+ assert_not_equal( h1.object_id, h3.object_id )
14
+ end
15
+
16
+ def test_multiton_with_options
17
+ h1 = HtmlFilter.new( :strip_comments => false )
18
+ h2 = HtmlFilter.new( :strip_comments => false )
19
+ h3 = HtmlFilter.new
20
+ assert_equal( h1.object_id, h2.object_id )
21
+ assert_not_equal( h1.object_id, h3.object_id )
22
+ end
23
+
24
# strip_single is private, hence the send
+ def test_strip_single
25
+ hf = HtmlFilter.new
26
+ assert_equal( '"', hf.send(:strip_single,'\"') )
27
+ assert_equal( "\000", hf.send(:strip_single,'\0') )
28
+ end
29
+
30
+ # functional tests: expected output first, input second
31
+
32
+ def assert_filter(filtered, original)
33
+ assert_equal(filtered, original.html_filter)
34
+ end
35
+
36
# an attribute value whose closing quote is missing
+ def test_fix_quotes
37
+ assert_filter '<img src="foo.jpg" />', "<img src=\"foo.jpg />"
38
+ end
39
+
40
+ def test_basics
41
+ assert_filter '', ''
42
+ assert_filter 'hello', 'hello'
43
+ end
44
+
45
# doubled or stray angle brackets
+ def test_balancing_tags
46
+ assert_filter "<b>hello</b>", "<<b>hello</b>"
47
+ assert_filter "<b>hello</b>", "<b>>hello</b>"
48
+ assert_filter "<b>hello</b>", "<b>hello<</b>"
49
+ assert_filter "<b>hello</b>", "<b>hello</b>>"
50
+ assert_filter "", "<>"
51
+ end
52
+
53
# unclosed tags get closed, empty and unmatched ones are dropped
+ def test_tag_completion
54
+ assert_filter "hello", "hello<b>"
55
+ assert_filter "<b>hello</b>", "<b>hello"
56
+ assert_filter "hello<b>world</b>", "hello<b>world"
57
+ assert_filter "hello", "hello</b>"
58
+ assert_filter "hello", "hello<b/>"
59
+ assert_filter "hello<b>world</b>", "hello<b/>world"
60
+ assert_filter "<b><b><b>hello</b></b></b>", "<b><b><b>hello"
61
+ assert_filter "", "</b><b>"
62
+ end
63
+
64
# img is in no_close and is always written self-closing
+ def test_end_slashes
65
+ assert_filter '<img />', '<img>'
66
+ assert_filter '<img />', '<img/>'
67
+ assert_filter '', '<b/></b>'
68
+ end
69
+
70
+ end
metadata ADDED
@@ -0,0 +1,75 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: htmlfilter
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.0.0
5
+ platform: ruby
6
+ authors: []
7
+
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+
12
+ date: 2009-09-22 00:00:00 -04:00
13
+ default_executable:
14
+ dependencies: []
15
+
16
+ description: HTML Filter library can be used to sanitize and sterilize HTML. A good idea if you let users submit HTML in comments, for instance. This library also include CssFilter. The CssFilter class will clean-up a cascading style sheet. It can be used to remove whitespace and most importantly remove urls.
17
+ email:
18
+ executables: []
19
+
20
+ extensions: []
21
+
22
+ extra_rdoc_files:
23
+ - Rakefile
24
+ - Manifest.txt
25
+ - TODO
26
+ - README.rdoc
27
+ - History.rdoc
28
+ files:
29
+ - lib/cssfilter.rb
30
+ - lib/htmlfilter/multiton.rb
31
+ - lib/htmlfilter.rb
32
+ - meta/package
33
+ - meta/project
34
+ - meta/title
35
+ - meta/version
36
+ - test/test_cssfilter.rb
37
+ - test/test_htmlfilter.rb
38
+ - Rakefile
39
+ - Manifest.txt
40
+ - TODO
41
+ - README.rdoc
42
+ - History.rdoc
43
+ has_rdoc: true
44
+ homepage:
45
+ licenses: []
46
+
47
+ post_install_message:
48
+ rdoc_options:
49
+ - --inline-source
50
+ - --title
51
+ - htmlfilter api
52
+ require_paths:
53
+ - lib
54
+ required_ruby_version: !ruby/object:Gem::Requirement
55
+ requirements:
56
+ - - ">="
57
+ - !ruby/object:Gem::Version
58
+ version: "0"
59
+ version:
60
+ required_rubygems_version: !ruby/object:Gem::Requirement
61
+ requirements:
62
+ - - ">="
63
+ - !ruby/object:Gem::Version
64
+ version: "0"
65
+ version:
66
+ requirements: []
67
+
68
+ rubyforge_project: htmlfilter
69
+ rubygems_version: 1.3.5
70
+ signing_key:
71
+ specification_version: 3
72
+ summary: HTML Filter library can be used to sanitize and sterilize HTML.
73
+ test_files:
74
+ - test/test_cssfilter.rb
75
+ - test/test_htmlfilter.rb