rails-deprecated_sanitizer 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,188 @@
1
+ require 'set'
2
+ require 'cgi'
3
+ require 'active_support/core_ext/module/attribute_accessors'
4
+
5
+ module HTML
6
# Base sanitizer. Tokenizes the input and hands each parsed node to
# #process_node; this base implementation emits every node unchanged, and
# subclasses override #process_node to filter.
class Sanitizer
  # Returns +text+ itself when #sanitizeable? reports there is nothing to
  # do, otherwise the joined output of #tokenize.
  #
  # Raises ArgumentError when +:tags+ or +:attributes+ is supplied but is
  # not an Enumerable.
  def sanitize(text, options = {})
    validate_options(options)
    sanitizeable?(text) ? tokenize(text, options).join : text
  end

  # True only for a non-nil, non-empty text that contains a "<".
  def sanitizeable?(text)
    return false if text.nil? || text.empty?
    !text.index("<").nil?
  end

  protected

  # Runs the tokenizer over +text+ and collects whatever #process_node
  # appends for each token.
  def tokenize(text, options)
    tokenizer = HTML::Tokenizer.new(text)
    output = []
    while (token = tokenizer.next)
      process_node(Node.parse(nil, 0, 0, token, false), output, options)
    end
    output
  end

  # Appends the node's string form to +result+.
  def process_node(node, result, options)
    result << node.to_s
  end

  # Rejects +:tags+ / +:attributes+ values that are present but not
  # Enumerable.
  def validate_options(options)
    tags = options[:tags]
    raise ArgumentError, "You should pass :tags as an Enumerable" if tags && !tags.is_a?(Enumerable)

    attributes = options[:attributes]
    raise ArgumentError, "You should pass :attributes as an Enumerable" if attributes && !attributes.is_a?(Enumerable)

    nil
  end
end
42
+
43
# Sanitizer that keeps only text nodes and strips HTML comments.
class FullSanitizer < Sanitizer
  # Sanitizes +text+, removes comments from the result, and repeats until
  # a pass leaves the text unchanged (so nested markup is handled too).
  def sanitize(text, options = {})
    stripped = super
    # A comment plus one directly following newline, so that a line holding
    # only a comment disappears entirely.
    comment = /<!--(.*?)-->[\n]?/m
    stripped = stripped.gsub(comment, "") if stripped && stripped =~ comment
    return stripped if stripped == text
    sanitize(stripped, options)
  end

  # Appends the node only when it is exactly an HTML::Text.
  def process_node(node, result, options)
    result << node.to_s if node.instance_of?(HTML::Text)
  end
end
57
+
58
# Sanitizer that drops the tags named in +included_tags+ and keeps every
# other node.
class LinkSanitizer < FullSanitizer
  cattr_accessor :included_tags, :instance_writer => false
  self.included_tags = Set.new(%w(a href))

  # True only when the text is non-empty, contains "<a" or "<href", and
  # also contains a ">".
  def sanitizeable?(text)
    return false if text.nil? || text.empty?
    opener = text.index("<a") || text.index("<href")
    !!(opener && text.index(">"))
  end

  protected

  # Appends the node unless it is an HTML::Tag whose name is one of
  # +included_tags+.
  def process_node(node, result, options)
    dropped = node.is_a?(HTML::Tag) && included_tags.include?(node.name)
    result << node.to_s unless dropped
  end
end
71
+
72
# Sanitizer driven by allow-lists of tags, attributes, URI protocols and CSS.
# Tags outside the allowed set are dropped, disallowed attributes are
# removed, every "<" in text is escaped, and text whose innermost open tag
# is one of +bad_tags+ is dropped.
class WhiteListSanitizer < Sanitizer
  [:protocol_separator, :uri_attributes, :allowed_attributes, :allowed_tags, :allowed_protocols, :bad_tags,
   :allowed_css_properties, :allowed_css_keywords, :shorthand_css_properties].each do |attr|
    class_attribute attr, :instance_writer => false
  end

  # A regular expression of the valid characters used to separate protocols like
  # the ':' in 'http://foo.com'
  self.protocol_separator = /:|(&#0*58)|(&#x70)|(&#x0*3a)|(%|&#37;)3A/i

  # Specifies a Set of HTML attributes that can have URIs.
  self.uri_attributes = Set.new(%w(href src cite action longdesc xlink:href lowsrc))

  # Specifies a Set of 'bad' tags that the #sanitize helper will remove completely, as opposed
  # to just escaping harmless tags like &lt;font&gt;
  self.bad_tags = Set.new(%w(script))

  # Specifies the default Set of tags that the #sanitize helper will allow unscathed.
  self.allowed_tags = Set.new(%w(strong em b i p code pre tt samp kbd var sub
    sup dfn cite big small address hr br div span h1 h2 h3 h4 h5 h6 ul ol li dl dt dd abbr
    acronym a img blockquote del ins))

  # Specifies the default Set of html attributes that the #sanitize helper will leave
  # in the allowed tag.
  self.allowed_attributes = Set.new(%w(href src width height alt cite datetime title class name xml:lang abbr))

  # Specifies the default Set of protocols that the #sanitize helper will accept in
  # attributes listed in +uri_attributes+.
  self.allowed_protocols = Set.new(%w(ed2k ftp http https irc mailto news gopher nntp telnet webcal xmpp callto
    feed svn urn aim rsync tag ssh sftp rtsp afs))

  # Specifies the default Set of acceptable css properties that #sanitize and #sanitize_css will accept.
  self.allowed_css_properties = Set.new(%w(azimuth background-color border-bottom-color border-collapse
    border-color border-left-color border-right-color border-top-color clear color cursor direction display
    elevation float font font-family font-size font-style font-variant font-weight height letter-spacing line-height
    overflow pause pause-after pause-before pitch pitch-range richness speak speak-header speak-numeral speak-punctuation
    speech-rate stress text-align text-decoration text-indent unicode-bidi vertical-align voice-family volume white-space
    width))

  # Specifies the default Set of acceptable css keywords that #sanitize and #sanitize_css will accept.
  self.allowed_css_keywords = Set.new(%w(auto aqua black block blue bold both bottom brown center
    collapse dashed dotted fuchsia gray green !important italic left lime maroon medium none navy normal
    nowrap olive pointer purple red right solid silver teal top transparent underline white yellow))

  # Specifies the default Set of allowed shorthand css properties for the #sanitize and #sanitize_css helpers.
  self.shorthand_css_properties = Set.new(%w(background border margin padding))

  # Sanitizes a block of css code. Used by #sanitize when it comes across a style attribute.
  #
  # Returns the accepted declarations joined by a space, or an empty string
  # when the style as a whole fails either shape check below.
  def sanitize_css(style)
    # disallow urls
    style = style.to_s.gsub(/url\s*\(\s*[^\s)]+?\s*\)\s*/, ' ')

    # gauntlet: the whole string must consist of harmless characters and
    # must be shaped like a list of "property: value;" declarations.
    if style !~ /\A([:,;#%.\sa-zA-Z0-9!]|\w-\w|\'[\s\w]+\'|\"[\s\w]+\"|\([\d,\s]+\))*\z/ ||
        style !~ /\A(\s*[-\w]+\s*:\s*[^:;]*(;|$)\s*)*\z/
      return ''
    end

    clean = []
    style.scan(/([-\w]+)\s*:\s*([^:;]*)/) do |prop, val|
      if allowed_css_properties.include?(prop.downcase)
        clean << prop + ': ' + val + ';'
      elsif shorthand_css_properties.include?(prop.split('-')[0].downcase)
        # A shorthand declaration is kept only if every word of its value is
        # an allowed keyword or looks like a colour / length.
        rejected = val.split.any? do |keyword|
          !allowed_css_keywords.include?(keyword) &&
            keyword !~ /\A(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)\z/
        end
        clean << prop + ': ' + val + ';' unless rejected
      end
    end
    clean.join(' ')
  end

  protected

  # Fills in the default allow-lists and the open-tag stack (+:parent+)
  # before delegating to the base tokenizer.
  #
  # Works on a merged copy so the caller's options hash is never written to:
  # it may be frozen or reused between calls, and the +:parent+ stack is
  # per-call state.
  def tokenize(text, options)
    options = options.merge(
      :parent     => [],
      :attributes => options[:attributes] || allowed_attributes,
      :tags       => options[:tags] || allowed_tags
    )
    super(text, options)
  end

  # Appends to +result+ the tag itself (when its name is in +:tags+), nil
  # for a disallowed tag, or the escaped text of a non-tag node. Text is
  # replaced by nil when the innermost open tag is one of +bad_tags+.
  def process_node(node, result, options)
    result << case node
      when HTML::Tag
        # Maintain the stack of open tag names; its head is the innermost one.
        if node.closing == :close
          options[:parent].shift
        else
          options[:parent].unshift node.name
        end

        process_attributes_for node, options

        options[:tags].include?(node.name) ? node : nil
      else
        bad_tags.include?(options[:parent].first) ? nil : node.to_s.gsub(/</, "&lt;")
    end
  end

  # Deletes attributes that are not allowed or that carry a bad protocol;
  # the remaining ones are re-escaped (or run through #sanitize_css for
  # +style+).
  def process_attributes_for(node, options)
    return unless node.attributes
    node.attributes.keys.each do |attr_name|
      value = node.attributes[attr_name].to_s

      if !options[:attributes].include?(attr_name) || contains_bad_protocols?(attr_name, value)
        node.attributes.delete(attr_name)
      else
        node.attributes[attr_name] = attr_name == 'style' ? sanitize_css(value) : CGI::escapeHTML(CGI::unescapeHTML(value))
      end
    end
  end

  # Truthy when +attr_name+ is a URI attribute and +value+ appears to start
  # with a protocol that is not in +allowed_protocols+.
  def contains_bad_protocols?(attr_name, value)
    return false unless uri_attributes.include?(attr_name)
    return false unless value =~ /(^[^\/:]*):|(&#0*58)|(&#x70)|(&#x0*3a)|(%|&#37;)3A/i

    # String#split drops trailing empty fields, so a value made only of a
    # separator (such as ":") yields no fields at all; treat that as an
    # empty, and therefore disallowed, protocol instead of failing on nil.
    protocol = value.split(protocol_separator).first.to_s.downcase.strip
    !allowed_protocols.include?(protocol)
  end
end
188
+ end
@@ -0,0 +1,830 @@
1
+ #--
2
+ # Copyright (c) 2006 Assaf Arkin (http://labnotes.org)
3
+ # Under MIT and/or CC By license.
4
+ #++
5
+
6
+ module HTML
7
+
8
+ # Selects HTML elements using CSS 2 selectors.
9
+ #
10
+ # The +Selector+ class uses CSS selector expressions to match and select
11
+ # HTML elements.
12
+ #
13
+ # For example:
14
+ # selector = HTML::Selector.new "form.login[action=/login]"
15
+ # creates a new selector that matches any +form+ element with the class
16
+ # +login+ and an attribute +action+ with the value <tt>/login</tt>.
17
+ #
18
+ # === Matching Elements
19
+ #
20
+ # Use the #match method to determine if an element matches the selector.
21
+ #
22
+ # For simple selectors, the method returns an array with that element,
23
+ # or +nil+ if the element does not match. For complex selectors (see below)
24
+ # the method returns an array with all matched elements, or +nil+ if no
25
+ # match found.
26
+ #
27
+ # For example:
28
+ # if selector.match(element)
29
+ # puts "Element is a login form"
30
+ # end
31
+ #
32
+ # === Selecting Elements
33
+ #
34
+ # Use the #select method to select all matching elements starting with
35
+ # one element and going through all children in depth-first order.
36
+ #
37
+ # This method returns an array of all matching elements, an empty array
38
+ # if no match is found
39
+ #
40
+ # For example:
41
+ # selector = HTML::Selector.new "input[type=text]"
42
+ # matches = selector.select(element)
43
+ # matches.each do |match|
44
+ # puts "Found text field with name #{match.attributes['name']}"
45
+ # end
46
+ #
47
+ # === Expressions
48
+ #
49
+ # Selectors can match elements using any of the following criteria:
50
+ # * <tt>name</tt> -- Match an element based on its name (tag name).
51
+ # For example, <tt>p</tt> to match a paragraph. You can use <tt>*</tt>
52
+ # to match any element.
53
+ # * <tt>#</tt><tt>id</tt> -- Match an element based on its identifier (the
54
+ # <tt>id</tt> attribute). For example, <tt>#</tt><tt>page</tt>.
55
+ # * <tt>.class</tt> -- Match an element based on its class name, all
56
+ # class names if more than one specified.
57
+ # * <tt>[attr]</tt> -- Match an element that has the specified attribute.
58
+ # * <tt>[attr=value]</tt> -- Match an element that has the specified
59
+ # attribute and value. (More operators are supported see below)
60
+ # * <tt>:pseudo-class</tt> -- Match an element based on a pseudo class,
61
+ # such as <tt>:nth-child</tt> and <tt>:empty</tt>.
62
+ # * <tt>:not(expr)</tt> -- Match an element that does not match the
63
+ # negation expression.
64
+ #
65
+ # When using a combination of the above, the element name comes first
66
+ # followed by identifier, class names, attributes, pseudo classes and
67
+ # negation in any order. Do not separate these parts with spaces!
68
+ # Space separation is used for descendant selectors.
69
+ #
70
+ # For example:
71
+ # selector = HTML::Selector.new "form.login[action=/login]"
72
+ # The matched element must be of type +form+ and have the class +login+.
73
+ # It may have other classes, but the class +login+ is required to match.
74
+ # It must also have an attribute called +action+ with the value
75
+ # <tt>/login</tt>.
76
+ #
77
+ # This selector will match the following element:
78
+ # <form class="login form" method="post" action="/login">
79
+ # but will not match the element:
80
+ # <form method="post" action="/logout">
81
+ #
82
+ # === Attribute Values
83
+ #
84
+ # Several operators are supported for matching attributes:
85
+ # * <tt>name</tt> -- The element must have an attribute with that name.
86
+ # * <tt>name=value</tt> -- The element must have an attribute with that
87
+ # name and value.
88
+ # * <tt>name^=value</tt> -- The attribute value must start with the
89
+ # specified value.
90
+ # * <tt>name$=value</tt> -- The attribute value must end with the
91
+ # specified value.
92
+ # * <tt>name*=value</tt> -- The attribute value must contain the
93
+ # specified value.
94
+ # * <tt>name~=word</tt> -- The attribute value must contain the specified
95
+ # word (space separated).
96
+ # * <tt>name|=word</tt> -- The attribute value must start with specified
97
+ # word.
98
+ #
99
+ # For example, the following two selectors match the same element:
100
+ # #my_id
101
+ # [id=my_id]
102
+ # and so do the following two selectors:
103
+ # .my_class
104
+ # [class~=my_class]
105
+ #
106
+ # === Alternatives, siblings, children
107
+ #
108
+ # Complex selectors use a combination of expressions to match elements:
109
+ # * <tt>expr1 expr2</tt> -- Match any element against the second expression
110
+ # if it has some parent element that matches the first expression.
111
+ # * <tt>expr1 > expr2</tt> -- Match any element against the second expression
112
+ # if it is the child of an element that matches the first expression.
113
+ # * <tt>expr1 + expr2</tt> -- Match any element against the second expression
114
+ # if it immediately follows an element that matches the first expression.
115
+ # * <tt>expr1 ~ expr2</tt> -- Match any element against the second expression
116
+ # that comes after an element that matches the first expression.
117
+ # * <tt>expr1, expr2</tt> -- Match any element against the first expression,
118
+ # or against the second expression.
119
+ #
120
+ # Since children and sibling selectors may match more than one element given
121
+ # the first element, the #match method may return more than one match.
122
+ #
123
+ # === Pseudo classes
124
+ #
125
+ # Pseudo classes were introduced in CSS 3. They are most often used to select
126
+ # elements in a given position:
127
+ # * <tt>:root</tt> -- Match the element only if it is the root element
128
+ # (no parent element).
129
+ # * <tt>:empty</tt> -- Match the element only if it has no child elements,
130
+ # and no text content.
131
+ # * <tt>:content(string)</tt> -- Match the element only if it has <tt>string</tt>
132
+ # as its text content (ignoring leading and trailing whitespace).
133
+ # * <tt>:only-child</tt> -- Match the element if it is the only child (element)
134
+ # of its parent element.
135
+ # * <tt>:only-of-type</tt> -- Match the element if it is the only child (element)
136
+ # of its parent element and its type.
137
+ # * <tt>:first-child</tt> -- Match the element if it is the first child (element)
138
+ # of its parent element.
139
+ # * <tt>:first-of-type</tt> -- Match the element if it is the first child (element)
140
+ # of its parent element of its type.
141
+ # * <tt>:last-child</tt> -- Match the element if it is the last child (element)
142
+ # of its parent element.
143
+ # * <tt>:last-of-type</tt> -- Match the element if it is the last child (element)
144
+ # of its parent element of its type.
145
+ # * <tt>:nth-child(b)</tt> -- Match the element if it is the b-th child (element)
146
+ # of its parent element. The value <tt>b</tt> specifies its index, starting with 1.
147
+ # * <tt>:nth-child(an+b)</tt> -- Match the element if it is the b-th child (element)
148
+ # in each group of <tt>a</tt> child elements of its parent element.
149
+ # * <tt>:nth-child(-an+b)</tt> -- Match the element if it is the first child (element)
150
+ # in each group of <tt>a</tt> child elements, up to the first <tt>b</tt> child
151
+ # elements of its parent element.
152
+ # * <tt>:nth-child(odd)</tt> -- Match element in the odd position (i.e. first, third).
153
+ # Same as <tt>:nth-child(2n+1)</tt>.
154
+ # * <tt>:nth-child(even)</tt> -- Match element in the even position (i.e. second,
155
+ # fourth). Same as <tt>:nth-child(2n+2)</tt>.
156
+ # * <tt>:nth-of-type(..)</tt> -- As above, but only counts elements of its type.
157
+ # * <tt>:nth-last-child(..)</tt> -- As above, but counts from the last child.
158
+ # * <tt>:nth-last-of-type(..)</tt> -- As above, but counts from the last child and
159
+ # only elements of its type.
160
+ # * <tt>:not(selector)</tt> -- Match the element only if the element does not
161
+ # match the simple selector.
162
+ #
163
+ # As you can see, <tt>:nth-child</tt> pseudo class and its variant can get quite
164
+ # tricky and the CSS specification doesn't do a much better job explaining it.
165
+ # But after reading the examples and trying a few combinations, it's easy to
166
+ # figure out.
167
+ #
168
+ # For example:
169
+ # table tr:nth-child(odd)
170
+ # Selects every second row in the table starting with the first one.
171
+ #
172
+ # div p:nth-child(4)
173
+ # Selects the fourth paragraph in the +div+, but not if the +div+ contains
174
+ # other elements, since those are also counted.
175
+ #
176
+ # div p:nth-of-type(4)
177
+ # Selects the fourth paragraph in the +div+, counting only paragraphs, and
178
+ # ignoring all other elements.
179
+ #
180
+ # div p:nth-of-type(-n+4)
181
+ # Selects the first four paragraphs, ignoring all others.
182
+ #
183
+ # And you can always select an element that matches one set of rules but
184
+ # not another using <tt>:not</tt>. For example:
185
+ # p:not(.post)
186
+ # Matches all paragraphs that do not have the class <tt>.post</tt>.
187
+ #
188
+ # === Substitution Values
189
+ #
190
+ # You can use substitution with identifiers, class names and element values.
191
+ # A substitution takes the form of a question mark (<tt>?</tt>) and uses the
192
+ # next value in the argument list following the CSS expression.
193
+ #
194
+ # The substitution value may be a string or a regular expression. All other
195
+ # values are converted to strings.
196
+ #
197
+ # For example:
198
+ # selector = HTML::Selector.new "#?", /^\d+$/
199
+ # matches any element whose identifier consists of one or more digits.
200
+ #
201
+ # See http://www.w3.org/TR/css3-selectors/
202
+ class Selector
203
+
204
+
205
+ # An invalid selector.
206
# Error raised for an invalid selector.
class InvalidSelectorError < StandardError; end #:nodoc:
208
+
209
+
210
class << self

  # :call-seq:
  #   Selector.for_class(cls) => selector
  #
  # Creates a new selector for the given class name.
  #
  # The expression and its substitution value are passed as separate
  # arguments because the constructor treats its first argument as a
  # String. The attribute form is used since the <tt>.class</tt> syntax
  # only accepts a literal name, whereas <tt>[class~=?]</tt> accepts a
  # substitution value and matches the same elements.
  def for_class(cls)
    new("[class~=?]", cls)
  end


  # :call-seq:
  #   Selector.for_id(id) => selector
  #
  # Creates a new selector for the given id, using the <tt>#?</tt>
  # substitution form.
  def for_id(id)
    new("#?", id)
  end

end
230
+
231
+
232
+ # :call-seq:
233
+ # Selector.new(string, [values ...]) => selector
234
+ #
235
+ # Creates a new selector from a CSS 2 selector expression.
236
+ #
237
+ # The first argument is the selector expression. All other arguments
238
+ # are used for value substitution.
239
+ #
240
+ # Raises ArgumentError if the selector expression is empty or invalid.
241
def initialize(selector, *values)
  raise ArgumentError, "CSS expression cannot be empty" if selector.empty?
  @source = ""
  # Substitution values arrive either as a splat or as one array (which is
  # how this constructor passes them on when it recurses).
  values = values.first if values.size == 1 && values.first.is_a?(Array)

  # Parse on a copy: each step below removes what it recognised from the
  # front of +statement+, and the caller's string must stay untouched.
  statement = selector.strip.dup

  # Leading simple selector, including negation. Every key of the returned
  # hash becomes the instance variable of the same name.
  simple_selector(statement, values).each do |key, parsed|
    instance_variable_set("@#{key}", parsed)
  end

  @alternates = []
  @depends = nil

  # Shared by the combinators below: folds one match result into +found+
  # and reports whether the search can stop (first match only).
  absorb = lambda do |found, subset, first|
    if first && !subset.empty?
      found << subset.first
      true
    else
      found.concat subset
      false
    end
  end

  if statement.sub!(/^\s*,\s*/, "")
    # Alternative ("a, b"): keep the second selector, and hoist any
    # alternates it collected so they are all grouped in the top selector.
    second = Selector.new(statement, values)
    @alternates << second
    nested = second.instance_variable_get(:@alternates)
    if nested
      second.instance_variable_set(:@alternates, [])
      @alternates.concat nested
    end
    @source << " , " << second.to_s
  elsif statement.sub!(/^\s*\+\s*/, "")
    # "a + b": the second selector is matched against the element
    # immediately following this one.
    second = next_selector(statement, values)
    @depends = lambda do |element, first|
      sibling = next_element(element)
      sibling && second.match(sibling, first)
    end
    @source << " + " << second.to_s
  elsif statement.sub!(/^\s*~\s*/, "")
    # "a ~ b": the second selector is matched against every element
    # following this one.
    second = next_selector(statement, values)
    @depends = lambda do |element, first|
      found = []
      sibling = element
      while (sibling = next_element(sibling))
        subset = second.match(sibling, first)
        next unless subset
        break if absorb.call(found, subset, first)
      end
      found.empty? ? nil : found
    end
    @source << " ~ " << second.to_s
  elsif statement.sub!(/^\s*>\s*/, "")
    # "a > b": the second selector is matched against the child elements of
    # this one.
    second = next_selector(statement, values)
    @depends = lambda do |element, first|
      found = []
      element.children.each do |child|
        next unless child.tag?
        subset = second.match(child, first)
        next unless subset
        break if absorb.call(found, subset, first)
      end
      found.empty? ? nil : found
    end
    @source << " > " << second.to_s
  elsif statement =~ /^\s+\S+/ && statement != selector
    # "a b": the second selector is matched against the descendants of this
    # one, walked with an explicit stack. The children of a node that
    # produced a match result are not pushed.
    second = next_selector(statement, values)
    @depends = lambda do |element, first|
      found = []
      pending = element.children.reverse
      while (node = pending.pop)
        next unless node.tag?
        subset = second.match(node, first)
        if subset
          break if absorb.call(found, subset, first)
        elsif (kids = node.children)
          pending.concat kids.reverse
        end
      end
      found.empty? ? nil : found
    end
    @source << " " << second.to_s
  else
    # Last selector in the chain: anything still unparsed is an error.
    raise ArgumentError, "Invalid selector: #{statement}" unless statement.strip.empty?
  end
end
345
+
346
+
347
+ # :call-seq:
348
+ # match(element, first?) => array or nil
349
+ #
350
+ # Matches an element against the selector.
351
+ #
352
+ # For a simple selector this method returns an array with the
353
+ # element if the element matches, nil otherwise.
354
+ #
355
+ # For a complex selector (sibling and descendant) this method
356
+ # returns an array with all matching elements, nil if no match is
357
+ # found.
358
+ #
359
+ # Use +first_only=true+ if you are only interested in the first element.
360
+ #
361
+ # For example:
362
+ # if selector.match(element)
363
+ # puts "Element is a login form"
364
+ # end
365
def match(element, first_only = false)
  # Tag name (when one was given) and every attribute pattern must match.
  matched = (!@tag_name || @tag_name == element.name) &&
            @attributes.all? { |name, pattern| element.attributes[name] =~ pattern }

  # Pseudo classes (nth-child, empty, etc) must all accept the element.
  matched &&= @pseudo.all? { |check| check.call(element) }

  # Negation: the element fails as soon as one negated selector hits on its
  # tag name, on any of its attribute patterns, or on any of its pseudo
  # classes.
  if matched && @negation
    matched = @negation.none? do |negated|
      negated[:tag_name] == element.name ||
        negated[:attributes].any? { |name, pattern| element.attributes[name] =~ pattern } ||
        negated[:pseudo].any? { |check| check.call(element) }
    end
  end

  # A matching element that depends on another one (child, sibling, etc)
  # yields the dependent matches instead of itself.
  matches =
    if matched && @depends
      @depends.call(element, first_only)
    elsif matched
      [element]
    end

  # Grouped selectors: try the alternatives as well, stopping at the first
  # hit when only one match is wanted.
  unless first_only && matches
    @alternates.each do |alternate|
      break if matches && first_only
      subset = alternate.match(element, first_only)
      next unless subset
      if matches
        matches.concat subset
      else
        matches = subset
      end
    end
  end

  matches
end
437
+
438
+
439
+ # :call-seq:
440
+ # select(root) => array
441
+ #
442
+ # Selects and returns an array with all matching elements, beginning
443
+ # with one node and traversing through all children depth-first.
444
+ # Returns an empty array if no match is found.
445
+ #
446
+ # The root node may be any element in the document, or the document
447
+ # itself.
448
+ #
449
+ # For example:
450
+ # selector = HTML::Selector.new "input[type=text]"
451
+ # matches = selector.select(element)
452
+ # matches.each do |match|
453
+ # puts "Found text field with name #{match.attributes['name']}"
454
+ # end
455
def select(root)
  found = []
  pending = [root]
  while (node = pending.pop)
    subset = node.tag? && match(node, false)
    if subset
      # Keep each element once, compared by identity.
      subset.each do |candidate|
        found << candidate if found.none? { |seen| seen.equal?(candidate) }
      end
    elsif (kids = node.children)
      # Reversed so that popping visits the children in document order.
      pending.concat kids.reverse
    end
  end
  found
end
469
+
470
+
471
+ # Similar to #select but returns the first matching element. Returns +nil+
472
+ # if no element matches the selector.
473
def select_first(root)
  pending = [root]
  while (node = pending.pop)
    subset = node.tag? && match(node, true)
    if subset
      return subset.first unless subset.empty?
    elsif (kids = node.children)
      # Reversed so that popping visits the children in document order.
      pending.concat kids.reverse
    end
  end
  nil
end
484
+
485
+
486
# Returns the selector source string built up while parsing.
def to_s; @source; end #:nodoc:
489
+
490
+
491
+ # Returns the next element after this one. Skips sibling text nodes.
492
+ #
493
+ # With the +name+ argument, returns the next element with that name,
494
+ # skipping other sibling elements.
495
# Returns the first tag node that follows +element+ among its parent's
# children, or +nil+ when there is none. With +name+, only a following
# element of that name is returned.
#
# An element without a parent has no siblings, so +nil+ is returned for it
# rather than failing on the missing parent.
def next_element(element, name = nil)
  parent = element.parent
  siblings = parent && parent.children
  return nil unless siblings

  passed = false
  siblings.each do |node|
    if node.equal?(element)
      passed = true
    elsif passed && node.tag?
      return node if name.nil? || node.name == name
    end
  end
  nil
end
508
+
509
+
510
+ protected
511
+
512
+
513
+ # Creates a simple selector given the statement and array of
514
+ # substitution values.
515
+ #
516
+ # Returns a hash with the values +tag_name+, +attributes+,
517
+ # +pseudo+ (classes) and +negation+.
518
+ #
519
+ # Called the first time with +can_negate+ true to allow
520
+ # negation. Called a second time with false since negation
521
+ # cannot be negated.
522
# Parses one simple selector from the front of +statement+, removing what
# it recognises from the string in place and appending a normalised form to
# <tt>@source</tt>. <tt>?</tt> placeholders take their value from the front
# of +values+.
#
# Returns a hash with the keys +:tag_name+, +:attributes+, +:pseudo+ and
# +:negation+. Raises ArgumentError for a nested or unclosed negation and
# for an unparsable nth-child expression.
def simple_selector(statement, values, can_negate = true)
  tag_name = nil
  attributes = []
  pseudo = []
  negation = []

  # Element name. (Note that in negation, this can come at
  # any order, but for simplicity we allow if only first).
  statement.sub!(/^(\*|[[:alpha:]][\w\-]*)/) do |match|
    match.strip!
    tag_name = match.downcase unless match == "*"
    @source << match
    "" # Remove
  end

  # Get identifier, class, attribute name, pseudo or negation. Each branch
  # restarts the loop when it consumed something; the loop ends once
  # nothing matches.
  while true
    # Element identifier.
    next if statement.sub!(/^#(\?|[\w\-]+)/) do
      id = $1
      if id == "?"
        id = values.shift
      end
      @source << "##{id}"
      id = Regexp.new("^#{Regexp.escape(id.to_s)}$") unless id.is_a?(Regexp)
      attributes << ["id", id]
      "" # Remove
    end

    # Class name.
    next if statement.sub!(/^\.([\w\-]+)/) do
      class_name = $1
      @source << ".#{class_name}"
      # The backslash is doubled on purpose: inside a double-quoted string a
      # bare \s is a literal space, which would make class names separated
      # by a tab or newline unmatchable.
      class_name = Regexp.new("(^|\\s)#{Regexp.escape(class_name)}($|\\s)")
      attributes << ["class", class_name]
      "" # Remove
    end

    # Attribute value: single-quoted, double-quoted or bare up to "]".
    next if statement.sub!(/^\[\s*([[:alpha:]][\w\-:]*)\s*((?:[~|^$*])?=)?\s*('[^']*'|"[^"]*"|[^\]]*)\s*\]/) do
      name, equality, value = $1, $2, $3
      if value == "?"
        value = values.shift
      else
        # Handle single and double quotes.
        value.strip!
        if (value[0] == ?" || value[0] == ?') && value[0] == value[-1]
          value = value[1..-2]
        end
      end
      @source << "[#{name}#{equality}'#{value}']"
      attributes << [name.downcase.strip, attribute_match(equality, value)]
      "" # Remove
    end

    # Root element only.
    next if statement.sub!(/^:root/) do
      pseudo << lambda do |element|
        element.parent.nil? || !element.parent.tag?
      end
      @source << ":root"
      "" # Remove
    end

    # Nth-child including last and of-type.
    next if statement.sub!(/^:nth-(last-)?(child|of-type)\((odd|even|(\d+|\?)|(-?\d*|\?)?n([+\-]\d+|\?)?)\)/) do |match|
      reverse = $1 == "last-"
      of_type = $2 == "of-type"
      @source << ":nth-#{$1}#{$2}("
      # Each regexp +when+ below re-matches the argument, so $1 and $2
      # inside a branch refer to that branch's own groups.
      case $3
        when "odd"
          pseudo << nth_child(2, 1, of_type, reverse)
          @source << "odd)"
        when "even"
          pseudo << nth_child(2, 2, of_type, reverse)
          @source << "even)"
        when /^(\d+|\?)$/  # b only
          b = ($1 == "?" ? values.shift : $1).to_i
          pseudo << nth_child(0, b, of_type, reverse)
          @source << "#{b})"
        when /^(-?\d*|\?)?n([+\-]\d+|\?)?$/
          a = ($1 == "?" ? values.shift :
               $1 == "" ? 1 : $1 == "-" ? -1 : $1).to_i
          b = ($2 == "?" ? values.shift : $2).to_i
          pseudo << nth_child(a, b, of_type, reverse)
          @source << (b >= 0 ? "#{a}n+#{b})" : "#{a}n#{b})")
        else
          raise ArgumentError, "Invalid nth-child #{match}"
      end
      "" # Remove
    end
    # First/last child (of type).
    next if statement.sub!(/^:(first|last)-(child|of-type)/) do
      reverse = $1 == "last"
      of_type = $2 == "of-type"
      pseudo << nth_child(0, 1, of_type, reverse)
      @source << ":#{$1}-#{$2}"
      "" # Remove
    end
    # Only child (of type).
    next if statement.sub!(/^:only-(child|of-type)/) do
      of_type = $1 == "of-type"
      pseudo << only_child(of_type)
      @source << ":only-#{$1}"
      "" # Remove
    end

    # Empty: no child elements or meaningful content (whitespaces
    # are ignored).
    next if statement.sub!(/^:empty/) do
      pseudo << lambda do |element|
        empty = true
        for child in element.children
          if child.tag? || !child.content.strip.empty?
            empty = false
            break
          end
        end
        empty
      end
      @source << ":empty"
      "" # Remove
    end
    # Content: match the text content of the element, stripping
    # leading and trailing spaces.
    next if statement.sub!(/^:content\(\s*(\?|'[^']*'|"[^"]*"|[^)]*)\s*\)/) do
      content = $1
      if content == "?"
        content = values.shift
      elsif (content[0] == ?" || content[0] == ?') && content[0] == content[-1]
        content = content[1..-2]
      end
      @source << ":content('#{content}')"
      content = Regexp.new("^#{Regexp.escape(content.to_s)}$") unless content.is_a?(Regexp)
      pseudo << lambda do |element|
        text = ""
        for child in element.children
          unless child.tag?
            text << child.content
          end
        end
        text.strip =~ content
      end
      "" # Remove
    end

    # Negation. Create another simple selector to handle it; the nested
    # call is told it may not negate again.
    if statement.sub!(/^:not\(\s*/, "")
      raise ArgumentError, "Double negatives are not missing feature" unless can_negate
      @source << ":not("
      negation << simple_selector(statement, values, false)
      raise ArgumentError, "Negation not closed" unless statement.sub!(/^\s*\)/, "")
      @source << ")"
      next
    end

    # No match: moving on.
    break
  end

  # Return hash. The keys are mapped to instance variables.
  {:tag_name=>tag_name, :attributes=>attributes, :pseudo=>pseudo, :negation=>negation}
end
685
+
686
+
687
# Create a regular expression to match an attribute value based
# on the equality operator (=, ^=, |=, etc).
#
# * +equality+ -- The operator string ("=", "~=", "^=", "$=", "*=", "|=")
#   or anything else (e.g. nil) for a bare existence check.
# * +value+ -- The value to match. A Regexp is embedded as-is; any other
#   object is converted with +to_s+ and escaped so it matches literally.
#
# Returns a Regexp. Raises InvalidSelectorError when no recognised operator
# is given but +value+ is not empty.
def attribute_match(equality, value)
  regexp = value.is_a?(Regexp) ? value : Regexp.escape(value.to_s)
  case equality
  when "="
    # Match the attribute value in full
    Regexp.new("^#{regexp}$")
  when "~="
    # Match a whitespace-separated word within the attribute value.
    # The backslash is doubled on purpose: inside a double-quoted Ruby
    # string "\s" is a literal space, which would miss tab/newline
    # separated tokens.
    Regexp.new("(^|\\s)#{regexp}($|\\s)")
  when "^="
    # Match the beginning of the attribute value
    Regexp.new("^#{regexp}")
  when "$="
    # Match the end of the attribute value
    Regexp.new("#{regexp}$")
  when "*="
    # Match substring of the attribute value
    regexp.is_a?(Regexp) ? regexp : Regexp.new(regexp)
  when "|="
    # Match the first whitespace-separated item of the attribute value
    # (same escaping note as for "~=").
    Regexp.new("^#{regexp}($|\\s)")
  else
    raise InvalidSelectorError, "Invalid operation/value" unless value.empty?
    # Match all attributes values (existence check)
    //
  end
end
716
+
717
+
718
# Returns a lambda that can match an element against the nth-child
# pseudo class, given the following arguments:
# * +a+ -- Value of a part.
# * +b+ -- Value of b part.
# * +of_type+ -- True to test only elements of this type (of-type).
# * +reverse+ -- True to count in reverse order (last-).
#
# The returned lambda takes an element and returns true/false. Only tag
# children of the element's parent are counted (text nodes and comments
# are skipped); with +of_type+ only siblings sharing the element's name
# are counted.
#
# Offset handling as implemented here:
# * a == 0 and b == 0 never matches; a < 0 and b < 0 never matches.
# * a negative +b+ is first turned into <tt>a + b + 1</tt>.
# * +b+ is then made zero based, except that b == 0 is treated like b == 1.
def nth_child(a, b, of_type, reverse)
  # a = 0 means select at index b, if b = 0 nothing selected
  return lambda { |element| false } if a == 0 && b == 0
  # a < 0 and b < 0 will never match against an index
  return lambda { |element| false } if a < 0 && b < 0
  b = a + b + 1 if b < 0   # b < 0 just picks last element from each group
  b -= 1 unless b == 0     # b == 0 is same as b == 1, otherwise zero based
  lambda do |element|
    # Element must be inside parent element.
    parent = element.parent
    return false unless parent && parent.tag?
    # Get siblings, reverse if counting from last.
    siblings = parent.children
    siblings = siblings.reverse if reverse
    # Match element name if of-type, otherwise ignore name.
    name = of_type ? element.name : nil
    # Only tags (optionally of the same name) take part in the count.
    candidates = siblings.select do |child|
      child.tag? && (name.nil? || child.name == name)
    end
    index = candidates.index { |child| child.equal?(element) }
    return false unless index

    if a == 0
      # Exactly one position can match.
      index == b
    elsif a < 0
      # Only the first b elements are considered.
      index <= b && (index % a) == 0
    else
      # index == a*n + b for some n >= 0. Comparing (index % a) with b
      # directly can never succeed once b >= a (e.g. 3n+5), so test the
      # distance from the offset instead. A still-negative offset keeps
      # its previous behaviour of never matching.
      b >= 0 && index >= b && ((index - b) % a) == 0
    end
  end
end
770
+
771
+
772
# Builds the matcher lambda for the only-child / only-of-type pseudo
# classes. Pass a true +of_type+ to compare against siblings that share
# the element's name only; otherwise every tag sibling counts.
#
# The returned lambda takes an element and returns true when the element's
# parent is a tag and holds no other qualifying tag child, false otherwise.
def only_child(of_type)
  lambda do |element|
    # Element must be inside parent element.
    container = element.parent
    return false unless container && container.tag?
    wanted = of_type ? element.name : nil
    # Text nodes/comments are ignored; any other matching tag disqualifies.
    container.children.none? do |sibling|
      sibling.tag? && (wanted.nil? || sibling.name == wanted) && !sibling.equal?(element)
    end
  end
end
792
+
793
+
794
# Builds a dependent selector (sibling, descendant, etc) from what is left
# of +statement+, handing it the array of substitution +values+.
#
# Kept as a shared helper because several call sites need it. Its one
# piece of logic: when the new selector collected alternates (comma
# separated selectors), they are moved into this selector's @alternates
# and the new selector's own list is reset to an empty array.
#
# Returns the newly created selector.
def next_selector(statement, values)
  Selector.new(statement, values).tap do |second|
    alternates = second.instance_variable_get(:@alternates)
    if alternates
      # Alternates always live on the top selector's group.
      second.instance_variable_set(:@alternates, [])
      @alternates.concat alternates
    end
  end
end
811
+
812
+ end
813
+
814
+
815
# Convenience constructor: forwards +statement+ and any substitution
# +values+ unchanged to Selector.new and returns the resulting selector.
#
# See HTML::Selector.new
def self.selector(statement, *values)
  Selector.new(statement, *values)
end
819
+
820
+
821
class Tag

  # Builds an HTML::Selector from +selector+ and the substitution +values+
  # (passed on as a single array), then runs it with this tag as the
  # argument to the selector's +select+, returning whatever that returns.
  def select(selector, *values)
    HTML::Selector.new(selector, values).select(self)
  end

end
829
+
830
+ end