assaf-scrapi 1.2.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,86 @@
1
module HTML

  # Sibling navigation, detaching and traversal helpers for tree nodes.
  #
  # All sibling lookups compare nodes by identity (+equal?+), and return
  # +nil+ when the node has no parent or the parent has no children list.
  class Node

    # Returns the next sibling node (of any kind), or +nil+ if this node is
    # the last child or has no parent.
    def next_sibling
      siblings = parent && parent.children
      return nil unless siblings
      index = siblings.index { |node| node.equal?(self) }
      index ? siblings[index + 1] : nil
    end


    # Returns the previous sibling node (of any kind), or +nil+ if this node
    # is the first child or has no parent.
    def previous_sibling
      siblings = parent && parent.children
      return nil unless siblings
      index = siblings.index { |node| node.equal?(self) }
      # Guard index 0 explicitly: siblings[-1] would wrap around to the
      # last sibling instead of reporting "no previous sibling".
      index && index > 0 ? siblings[index - 1] : nil
    end


    # Returns the next element after this one, skipping sibling nodes that
    # are not tags.
    #
    # With the +name+ argument, returns the next element with that name,
    # skipping other sibling elements. Returns +nil+ if there is none.
    def next_element(name = nil)
      siblings = parent && parent.children
      return nil unless siblings
      index = siblings.index { |node| node.equal?(self) }
      return nil unless index
      siblings[(index + 1)..-1].find do |node|
        node.tag? && (name.nil? || node.name == name)
      end
    end


    # Returns the closest element before this one, skipping sibling nodes
    # that are not tags.
    #
    # With the +name+ argument, returns the closest preceding element with
    # that name, skipping other sibling elements. Returns +nil+ if there is
    # none.
    def previous_element(name = nil)
      siblings = parent && parent.children
      return nil unless siblings
      index = siblings.index { |node| node.equal?(self) }
      return nil unless index
      siblings[0...index].reverse.find do |node|
        node.tag? && (name.nil? || node.name == name)
      end
    end


    # Detaches this node from its parent: removes it from the parent's
    # children and clears the parent reference. Returns the node itself.
    def detach
      if @parent
        @parent.children.delete_if { |child| child.equal?(self) }
        @parent = nil
      end
      self
    end


    # Yields this node and then every descendant, depth-first, passing
    # +value+ along as the second block argument. Returns +value+.
    def each(value = nil, &block)
      yield self, value
      @children.each { |child| child.each(value, &block) } if @children
      value
    end

  end

end
@@ -0,0 +1,825 @@
1
+ # ScrAPI toolkit for Ruby
2
+ #
3
+ # Copyright (c) 2006 Assaf Arkin, under Creative Commons Attribution and/or MIT License
4
+ # Developed for http://co.mments.com
5
+ # Code and documentation: http://labnotes.org
6
+
7
+
8
+ module HTML
9
+
10
+
11
+ # Selects HTML elements using CSS 2 selectors.
12
+ #
13
+ # The +Selector+ class uses CSS selector expressions to match and select
14
+ # HTML elements.
15
+ #
16
+ # For example:
17
+ # selector = HTML::Selector.new "form.login[action=/login]"
18
+ # creates a new selector that matches any +form+ element with the class
19
+ # +login+ and an attribute +action+ with the value <tt>/login</tt>.
20
+ #
21
+ # === Matching Elements
22
+ #
23
+ # Use the #match method to determine if an element matches the selector.
24
+ #
25
+ # For simple selectors, the method returns an array with that element,
26
+ # or +nil+ if the element does not match. For complex selectors (see below)
27
+ # the method returns an array with all matched elements, or +nil+ if no
28
+ # match found.
29
+ #
30
+ # For example:
31
+ # if selector.match(element)
32
+ # puts "Element is a login form"
33
+ # end
34
+ #
35
+ # === Selecting Elements
36
+ #
37
+ # Use the #select method to select all matching elements starting with
38
+ # one element and going through all children in depth-first order.
39
+ #
40
+ # This method returns an array of all matching elements, an empty array
41
+ # if no match is found
42
+ #
43
+ # For example:
44
+ # selector = HTML::Selector.new "input[type=text]"
45
+ # matches = selector.select(element)
46
+ # matches.each do |match|
47
+ # puts "Found text field with name #{match.attributes['name']}"
48
+ # end
49
+ #
50
+ # === Expressions
51
+ #
52
+ # Selectors can match elements using any of the following criteria:
53
+ # * <tt>name</tt> -- Match an element based on its name (tag name).
54
+ # For example, <tt>p</tt> to match a paragraph. You can use <tt>*</tt>
55
+ # to match any element.
56
+ # * <tt>#</tt><tt>id</tt> -- Match an element based on its identifier (the
57
+ # <tt>id</tt> attribute). For example, <tt>#</tt><tt>page</tt>.
58
+ # * <tt>.class</tt> -- Match an element based on its class name, all
59
+ # class names if more than one specified.
60
+ # * <tt>[attr]</tt> -- Match an element that has the specified attribute.
61
+ # * <tt>[attr=value]</tt> -- Match an element that has the specified
62
+ # attribute and value. (More operators are supported see below)
63
+ # * <tt>:pseudo-class</tt> -- Match an element based on a pseudo class,
64
+ # such as <tt>:nth-child</tt> and <tt>:empty</tt>.
65
+ # * <tt>:not(expr)</tt> -- Match an element that does not match the
66
+ # negation expression.
67
+ #
68
+ # When using a combination of the above, the element name comes first
69
+ # followed by identifier, class names, attributes, pseudo classes and
70
+ # negation in any order. Do not separate these parts with spaces!
71
+ # Space separation is used for descendant selectors.
72
+ #
73
+ # For example:
74
+ # selector = HTML::Selector.new "form.login[action=/login]"
75
+ # The matched element must be of type +form+ and have the class +login+.
76
+ # It may have other classes, but the class +login+ is required to match.
77
+ # It must also have an attribute called +action+ with the value
78
+ # <tt>/login</tt>.
79
+ #
80
+ # This selector will match the following element:
81
+ # <form class="login form" method="post" action="/login">
82
+ # but will not match the element:
83
+ # <form method="post" action="/logout">
84
+ #
85
+ # === Attribute Values
86
+ #
87
+ # Several operators are supported for matching attributes:
88
+ # * <tt>name</tt> -- The element must have an attribute with that name.
89
+ # * <tt>name=value</tt> -- The element must have an attribute with that
90
+ # name and value.
91
+ # * <tt>name^=value</tt> -- The attribute value must start with the
92
+ # specified value.
93
+ # * <tt>name$=value</tt> -- The attribute value must end with the
94
+ # specified value.
95
+ # * <tt>name*=value</tt> -- The attribute value must contain the
96
+ # specified value.
97
+ # * <tt>name~=word</tt> -- The attribute value must contain the specified
98
+ # word (space separated).
99
+ # * <tt>name|=word</tt> -- The attribute value must start with specified
100
+ # word.
101
+ #
102
+ # For example, the following two selectors match the same element:
103
+ # #my_id
104
+ # [id=my_id]
105
+ # and so do the following two selectors:
106
+ # .my_class
107
+ # [class~=my_class]
108
+ #
109
+ # === Alternatives, siblings, children
110
+ #
111
+ # Complex selectors use a combination of expressions to match elements:
112
+ # * <tt>expr1 expr2</tt> -- Match any element against the second expression
113
+ # if it has some parent element that matches the first expression.
114
+ # * <tt>expr1 > expr2</tt> -- Match any element against the second expression
115
+ # if it is the child of an element that matches the first expression.
116
+ # * <tt>expr1 + expr2</tt> -- Match any element against the second expression
117
+ # if it immediately follows an element that matches the first expression.
118
+ # * <tt>expr1 ~ expr2</tt> -- Match any element against the second expression
119
+ # that comes after an element that matches the first expression.
120
+ # * <tt>expr1, expr2</tt> -- Match any element against the first expression,
121
+ # or against the second expression.
122
+ #
123
+ # Since children and sibling selectors may match more than one element given
124
+ # the first element, the #match method may return more than one match.
125
+ #
126
+ # === Pseudo classes
127
+ #
128
+ # Pseudo classes were introduced in CSS 3. They are most often used to select
129
+ # elements in a given position:
130
+ # * <tt>:root</tt> -- Match the element only if it is the root element
131
+ # (no parent element).
132
+ # * <tt>:empty</tt> -- Match the element only if it has no child elements,
133
+ # and no text content.
134
+ # * <tt>:only-child</tt> -- Match the element if it is the only child (element)
135
+ # of its parent element.
136
+ # * <tt>:only-of-type</tt> -- Match the element if it is the only child (element)
137
+ # of its parent element and its type.
138
+ # * <tt>:first-child</tt> -- Match the element if it is the first child (element)
139
+ # of its parent element.
140
+ # * <tt>:first-of-type</tt> -- Match the element if it is the first child (element)
141
+ # of its parent element of its type.
142
+ # * <tt>:last-child</tt> -- Match the element if it is the last child (element)
143
+ # of its parent element.
144
+ # * <tt>:last-of-type</tt> -- Match the element if it is the last child (element)
145
+ # of its parent element of its type.
146
+ # * <tt>:nth-child(b)</tt> -- Match the element if it is the b-th child (element)
147
+ # of its parent element. The value <tt>b</tt> specifies its index, starting with 1.
148
+ # * <tt>:nth-child(an+b)</tt> -- Match the element if it is the b-th child (element)
149
+ # in each group of <tt>a</tt> child elements of its parent element.
150
+ # * <tt>:nth-child(-an+b)</tt> -- Match the element if it is the first child (element)
151
+ # in each group of <tt>a</tt> child elements, up to the first <tt>b</tt> child
152
+ # elements of its parent element.
153
+ # * <tt>:nth-child(odd)</tt> -- Match element in the odd position (i.e. first, third).
154
+ # Same as <tt>:nth-child(2n+1)</tt>.
155
+ # * <tt>:nth-child(even)</tt> -- Match element in the even position (i.e. second,
156
+ # fourth). Same as <tt>:nth-child(2n+2)</tt>.
157
+ # * <tt>:nth-of-type(..)</tt> -- As above, but only counts elements of its type.
158
+ # * <tt>:nth-last-child(..)</tt> -- As above, but counts from the last child.
159
+ # * <tt>:nth-last-of-type(..)</tt> -- As above, but counts from the last child and
160
+ # only elements of its type.
161
+ # * <tt>:not(selector)</tt> -- Match the element only if the element does not
162
+ # match the simple selector.
163
+ #
164
+ # As you can see, the <tt>:nth-child</tt> pseudo class and its variants can get quite
165
+ # tricky and the CSS specification doesn't do a much better job explaining it.
166
+ # But after reading the examples and trying a few combinations, it's easy to
167
+ # figure out.
168
+ #
169
+ # For example:
170
+ # table tr:nth-child(odd)
171
+ # Selects every second row in the table starting with the first one.
172
+ #
173
+ # div p:nth-child(4)
174
+ # Selects the fourth paragraph in the +div+, but not if the +div+ contains
175
+ # other elements, since those are also counted.
176
+ #
177
+ # div p:nth-of-type(4)
178
+ # Selects the fourth paragraph in the +div+, counting only paragraphs, and
179
+ # ignoring all other elements.
180
+ #
181
+ # div p:nth-of-type(-n+4)
182
+ # Selects the first four paragraphs, ignoring all others.
183
+ #
184
+ # And you can always select an element that matches one set of rules but
185
+ # not another using <tt>:not</tt>. For example:
186
+ # p:not(.post)
187
+ # Matches all paragraphs that do not have the class <tt>.post</tt>.
188
+ #
189
+ # === Substitution Values
190
+ #
191
+ # You can use substitution with identifiers, class names and element values.
192
+ # A substitution takes the form of a question mark (<tt>?</tt>) and uses the
193
+ # next value in the argument list following the CSS expression.
194
+ #
195
+ # The substitution value may be a string or a regular expression. All other
196
+ # values are converted to strings.
197
+ #
198
+ # For example:
199
+ # selector = HTML::Selector.new "#?", /^\d+$/
200
+ # matches any element whose identifier consists of one or more digits.
201
+ #
202
+ # See http://www.w3.org/TR/css3-selectors/
203
+ class Selector
204
+
205
+
206
# Error type for an invalid selector (raised for an invalid attribute
# operation/value combination).
class InvalidSelectorError < StandardError
end
208
+
209
+
210
+ class << self
211
+
212
# :call-seq:
#   Selector.for_class(cls) => selector
#
# Creates a new selector that matches elements having the given class
# name.
#
# +cls+ is supplied as the substitution value of the attribute expression
# <tt>[class~=?]</tt> (the attribute form of <tt>.class</tt>), so it may
# be a string or a regular expression. The expression and the value are
# passed as separate arguments: wrapping them in a single array would make
# the array itself the selector expression.
def for_class(cls)
  new("[class~=?]", cls)
end
219
+
220
+
221
# :call-seq:
#   Selector.for_id(id) => selector
#
# Creates a new selector that matches the element with the given id.
#
# +id+ is supplied as the substitution value of the <tt>#?</tt>
# expression, so it may be a string or a regular expression. The
# expression and the value are passed as separate arguments: wrapping them
# in a single array would make the array itself the selector expression.
def for_id(id)
  new("#?", id)
end
228
+
229
+ end
230
+
231
+
232
# :call-seq:
#   Selector.new(string, [values ...]) => selector
#
# Creates a new selector from a CSS selector expression.
#
# The first argument is the selector expression. All other arguments (or
# a single array holding them) are used for <tt>?</tt> value substitution;
# values are consumed from the front of that array as they are used.
#
# Raises ArgumentError if the expression is empty or if some part of it
# cannot be parsed.
def initialize(selector, *values)
  raise ArgumentError, "CSS expression cannot be empty" if selector.empty?
  @source = ""
  # A single Array argument is taken to be the list of substitution values.
  values = values.first if values.size == 1 && values.first.is_a?(Array)
  # Parse a private copy: each step strips what it recognized from the
  # front, so whatever is left at the end is what failed to parse. The
  # caller's string is left untouched.
  statement = selector.strip.dup
  # Leading simple selector (with negations). Each part of the returned
  # hash is stored in the instance variable of the same name.
  simple_selector(statement, values).each_pair do |key, part|
    instance_variable_set("@#{key}", part)
  end

  if statement.sub!(/^\s*,\s*/, "")
    # Group (comma): the rest of the statement is an alternative selector.
    alternate = Selector.new(statement, values)
    (@alternates ||= []) << alternate
    # Flatten: alternatives found further down are moved up to this
    # (top) selector.
    nested = alternate.instance_variable_get(:@alternates)
    if nested
      alternate.instance_variable_set(:@alternates, nil)
      @alternates.concat nested
    end
    @source << " , " << alternate.to_s
  elsif statement.sub!(/^\s*\+\s*/, "")
    # Adjacent sibling (+): the dependent selector is matched against the
    # element immediately following this one.
    following = next_selector(statement, values)
    @depends = lambda do |element, first|
      sibling = next_element(element)
      sibling ? following.match(sibling, first) : nil
    end
    @source << " + " << following.to_s
  elsif statement.sub!(/^\s*~\s*/, "")
    # General sibling (~): the dependent selector is matched against every
    # element following this one.
    later = next_selector(statement, values)
    @depends = lambda do |element, first|
      found = []
      cursor = element
      while (cursor = next_element(cursor))
        subset = later.match(cursor, first)
        next unless subset
        if first && !subset.empty?
          # Only the first match was asked for.
          found << subset.first
          break
        end
        found.concat subset
      end
      found.empty? ? nil : found
    end
    @source << " ~ " << later.to_s
  elsif statement.sub!(/^\s*>\s*/, "")
    # Child (>): the dependent selector is matched against the child
    # elements of this one.
    inner = next_selector(statement, values)
    @depends = lambda do |element, first|
      found = []
      element.children.each do |child|
        next unless child.tag?
        subset = inner.match(child, first)
        next unless subset
        if first && !subset.empty?
          found << subset.first
          break
        end
        found.concat subset
      end
      found.empty? ? nil : found
    end
    @source << " > " << inner.to_s
  elsif statement =~ /^\s+\S+/ && statement != selector
    # Descendant (whitespace): the dependent selector is matched against
    # the descendant elements of this one, depth-first. The children of a
    # node that matched are not visited.
    nested = next_selector(statement, values)
    @depends = lambda do |element, first|
      found = []
      pending = element.children.reverse
      while (node = pending.pop)
        next unless node.tag?
        subset = nested.match(node, first)
        if subset
          if first && !subset.empty?
            found << subset.first
            break
          end
          found.concat subset
        elsif (kids = node.children)
          pending.concat kids.reverse
        end
      end
      found.empty? ? nil : found
    end
    @source << " " << nested.to_s
  elsif !statement.strip.empty?
    # Last selector in the chain: anything still left over is something
    # we could not parse.
    raise ArgumentError, "Invalid selector: #{statement}"
  end
end
340
+
341
+
342
# :call-seq:
#   match(element, first?) => array or nil
#
# Matches an element against the selector.
#
# For a simple selector this method returns an array holding the element
# if it matches, +nil+ otherwise.
#
# For a complex selector (sibling, child, descendant) this method returns
# whatever the dependent selector matched, +nil+ if nothing matched.
#
# Pass +first_only=true+ if you are only interested in the first match.
#
# For example:
#   if selector.match(element)
#     puts "Element is a login form"
#   end
def match(element, first_only = false)
  # The tag name (when one is required) and every attribute pattern must
  # match.
  matched = !@tag_name || @tag_name == element.name
  matched &&= @attributes.all? do |attr_name, pattern|
    element.attributes[attr_name] =~ pattern
  end

  # Every pseudo class test (nth-child, empty, etc) must pass.
  matched &&= @pseudo.all? { |check| check.call(element) }

  # Negations use the same criteria, but the element is rejected as soon
  # as any one of them is satisfied.
  if matched && @negation
    matched = @negation.none? do |negated|
      negated[:tag_name] == element.name ||
        negated[:attributes].any? { |attr_name, pattern| element.attributes[attr_name] =~ pattern } ||
        negated[:pseudo].any? { |check| check.call(element) }
    end
  end

  # A selector with a dependency (child, sibling, etc) reports what the
  # dependent selector matched rather than the element itself.
  matches =
    if matched && @depends
      @depends.call(element, first_only)
    elsif matched
      [element]
    end

  # For a selector group, also try the alternatives -- unless only the
  # first match is wanted and we already have one.
  if @alternates && (!first_only || !matches)
    @alternates.each do |alternate|
      break if matches && first_only
      subset = alternate.match(element, first_only)
      next unless subset
      if matches
        matches.concat subset
      else
        matches = subset
      end
    end
  end

  matches
end
432
+
433
+
434
# :call-seq:
#   select(root) => array
#
# Selects and returns an array with all matching elements, beginning
# with one node and traversing through its children depth-first.
# Returns an empty array if no match is found. The children of a node
# that matched are not visited.
#
# The root node may be any element in the document, or the document
# itself.
#
# For example:
#   selector = HTML::Selector.new "input[type=text]"
#   matches = selector.select(element)
#   matches.each do |match|
#     puts "Found text field with name #{match.attributes['name']}"
#   end
def select(root)
  matches = []
  # Object ids of the elements collected so far. Duplicates are detected
  # by identity, with a hash lookup instead of rescanning +matches+ for
  # every candidate.
  seen = {}
  stack = [root]
  while (node = stack.pop)
    subset = node.tag? && match(node, false)
    if subset
      subset.each do |found|
        next if seen[found.object_id]
        seen[found.object_id] = true
        matches << found
      end
    elsif (children = node.children)
      # Push in reverse so that children are popped in document order.
      stack.concat children.reverse
    end
  end
  matches
end
464
+
465
+
466
# Like #select, but stops at and returns the first matching element.
# Returns +nil+ if no element matches the selector.
def select_first(root)
  pending = [root]
  while (node = pending.pop)
    subset = node.tag? && match(node, true)
    if subset
      return subset.first unless subset.empty?
    elsif (kids = node.children)
      # Push in reverse so that children are popped in document order.
      pending.concat kids.reverse
    end
  end
  nil
end
479
+
480
+
481
# Returns the selector source text accumulated while the expression was
# parsed.
def to_s #:nodoc:
  @source
end
484
+
485
+
486
# Returns the next element after +element+, skipping sibling nodes that
# are not tags.
#
# With the +name+ argument, returns the next element with that name,
# skipping other sibling elements.
#
# Returns +nil+ if there is no such element, or if +element+ has no
# parent (and so no siblings to look at).
def next_element(element, name = nil)
  parent = element.parent
  siblings = parent && parent.children
  return nil unless siblings
  # Locate the element by identity, then scan only what follows it.
  position = siblings.index { |node| node.equal?(element) }
  return nil unless position
  siblings[(position + 1)..-1].find do |node|
    node.tag? && (name.nil? || node.name == name)
  end
end
503
+
504
+
505
+ protected
506
+
507
+
508
# Parses a simple selector off the front of +statement+, using +values+
# for <tt>?</tt> substitutions.
#
# Whatever is recognized is removed from +statement+ (it is modified in
# place) and echoed into <tt>@source</tt>; substitution values are
# shifted off the front of +values+ as they are used.
#
# Returns a hash with the keys +tag_name+, +attributes+ (name/regexp
# pairs), +pseudo+ (lambdas taking an element) and +negation+ (hashes of
# this same shape, one per <tt>:not(...)</tt>).
#
# Called the first time with +can_negate+ true to allow negation, and
# from within a negation with false, since a negation cannot itself be
# negated (ArgumentError is raised if that is attempted, or if a negation
# is not closed).
def simple_selector(statement, values, can_negate = true)
  tag_name = nil
  attributes = []
  pseudo = []
  negation = []

  # Element name. (In a negation this could come in any position, but for
  # simplicity it is only recognized when it comes first.)
  statement.sub!(/^(\*|[[:alpha:]][\w\-]*)/) do |match|
    match.strip!
    tag_name = match.downcase unless match == "*"
    @source << match
    "" # Remove
  end

  # Identifier, class, attribute, pseudo class or negation, in any order.
  # Each successful step removes its text and restarts the loop.
  loop do
    # Element identifier, literal or substituted.
    next if statement.sub!(/^#(\?|[\w\-]+)/) do
      id = $1
      id = values.shift if id == "?"
      @source << "##{id}"
      id = Regexp.new("^#{Regexp.escape(id.to_s)}$") unless id.is_a?(Regexp)
      attributes << ["id", id]
      "" # Remove
    end

    # Class name, literal or substituted. A substituted regular
    # expression is applied to the class attribute as is.
    next if statement.sub!(/^\.(\?|[\w\-]+)/) do
      class_name = $1
      class_name = values.shift if class_name == "?"
      @source << ".#{class_name}"
      unless class_name.is_a?(Regexp)
        # "\\s" so the regexp gets the whitespace class: a bare "\s" in a
        # double-quoted string is just a space character.
        class_name = Regexp.new("(^|\\s)#{Regexp.escape(class_name.to_s)}($|\\s)")
      end
      attributes << ["class", class_name]
      "" # Remove
    end

    # Attribute, with optional operator and (possibly quoted) value.
    next if statement.sub!(/^\[\s*([[:alpha:]][\w\-]*)\s*((?:[~|^$*])?=)?\s*('[^']*'|"[^"]*"|[^\]]*)\s*\]/) do
      name, equality, value = $1, $2, $3
      if value == "?"
        value = values.shift
      else
        # Handle single and double quotes.
        value.strip!
        if (value[0] == ?" or value[0] == ?') and value[0] == value[-1]
          value = value[1..-2]
        end
      end
      @source << "[#{name}#{equality}'#{value}']"
      attributes << [name.downcase.strip, attribute_match(equality, value)]
      "" # Remove
    end

    # Root element only: no parent, or a parent that is not a tag.
    next if statement.sub!(/^:root/) do
      pseudo << lambda do |element|
        element.parent.nil? or not element.parent.tag?
      end
      @source << ":root"
      "" # Remove
    end

    # Nth-child including last and of-type.
    next if statement.sub!(/^:nth-(last-)?(child|of-type)\((odd|even|(\d+|\?)|(-?\d*|\?)?n([+\-]\d+|\?)?)\)/) do |match|
      reverse = $1 == "last-"
      of_type = $2 == "of-type"
      @source << ":nth-#{$1}#{$2}("
      # Each regexp +when+ below re-binds $1/$2 to its own captures.
      case $3
      when "odd"
        pseudo << nth_child(2, 1, of_type, reverse)
        @source << "odd)"
      when "even"
        pseudo << nth_child(2, 2, of_type, reverse)
        @source << "even)"
      when /^(\d+|\?)$/ # b only
        b = ($1 == "?" ? values.shift : $1).to_i
        pseudo << nth_child(0, b, of_type, reverse)
        @source << "#{b})"
      when /^(-?\d*|\?)?n([+\-]\d+|\?)?$/
        a = ($1 == "?" ? values.shift :
             $1 == "" ? 1 : $1 == "-" ? -1 : $1).to_i
        b = ($2 == "?" ? values.shift : $2).to_i
        pseudo << nth_child(a, b, of_type, reverse)
        @source << (b >= 0 ? "#{a}n+#{b})" : "#{a}n#{b})")
      else
        raise ArgumentError, "Invalid nth-child #{match}"
      end
      "" # Remove
    end

    # First/last child (of type).
    next if statement.sub!(/^:(first|last)-(child|of-type)/) do
      reverse = $1 == "last"
      of_type = $2 == "of-type"
      pseudo << nth_child(0, 1, of_type, reverse)
      @source << ":#{$1}-#{$2}"
      "" # Remove
    end

    # Only child (of type).
    next if statement.sub!(/^:only-(child|of-type)/) do
      of_type = $1 == "of-type"
      pseudo << only_child(of_type)
      @source << ":only-#{$1}"
      "" # Remove
    end

    # Empty: no child elements and no content other than whitespace.
    next if statement.sub!(/^:empty/) do
      pseudo << lambda do |element|
        element.children.none? do |child|
          child.tag? or !child.content.strip.empty?
        end
      end
      @source << ":empty"
      "" # Remove
    end

    # Content: match the text content of the element (its non-tag
    # children), stripping leading and trailing spaces.
    next if statement.sub!(/^:content\(\s*(\?|'[^']*'|"[^"]*"|[^)]*)\s*\)/) do
      content = $1
      if content == "?"
        content = values.shift
      elsif (content[0] == ?" or content[0] == ?') and content[0] == content[-1]
        content = content[1..-2]
      end
      @source << ":content('#{content}')"
      content = Regexp.new("^#{Regexp.escape(content.to_s)}$") unless content.is_a?(Regexp)
      pseudo << lambda do |element|
        text = ""
        element.children.each do |child|
          text << child.content unless child.tag?
        end
        text.strip =~ content
      end
      "" # Remove
    end

    # Negation. Parse another simple selector to handle it.
    if statement.sub!(/^:not\(\s*/, "")
      raise ArgumentError, "Double negatives are not missing feature" unless can_negate
      @source << ":not("
      negation << simple_selector(statement, values, false)
      raise ArgumentError, "Negation not closed" unless statement.sub!(/^\s*\)/, "")
      @source << ")"
      next
    end

    # Nothing recognized: this simple selector is done.
    break
  end

  # Return hash. The keys are mapped to instance variables.
  {:tag_name=>tag_name, :attributes=>attributes, :pseudo=>pseudo, :negation=>negation}
end
680
+
681
+
682
# Creates the regular expression used to test an attribute value, based
# on the equality operator (=, ^=, |=, etc).
#
# +value+ may be a regular expression, used as is inside the resulting
# pattern; any other value is converted to a string and escaped.
#
# Without an operator this is an existence check: the returned pattern
# matches any value. InvalidSelectorError is raised if a value was given
# without an operator.
def attribute_match(equality, value)
  regexp = value.is_a?(Regexp) ? value : Regexp.escape(value.to_s)
  # Note the "\\s" below: a bare "\s" in a double-quoted string is a space
  # character, which would leave tabs and newlines unrecognized as
  # word separators.
  case equality
  when "="
    # Match the attribute value in full
    Regexp.new("^#{regexp}$")
  when "~="
    # Match a whitespace-separated word within the attribute value
    Regexp.new("(^|\\s)#{regexp}($|\\s)")
  when "^="
    # Match the beginning of the attribute value
    Regexp.new("^#{regexp}")
  when "$="
    # Match the end of the attribute value
    Regexp.new("#{regexp}$")
  when "*="
    # Match substring of the attribute value
    regexp.is_a?(Regexp) ? regexp : Regexp.new(regexp)
  when "|="
    # Match the first whitespace-separated item of the attribute value
    Regexp.new("^#{regexp}($|\\s)")
  else
    raise InvalidSelectorError, "Invalid operation/value" unless value.empty?
    # Match all attributes values (existence check)
    //
  end
end
711
+
712
+
713
# Builds the lambda that tests an element against an nth-child style
# pseudo class. Arguments:
# * +a+ -- Value of the a part (group size; 0 for a plain index).
# * +b+ -- Value of the b part (one-based position).
# * +of_type+ -- True to count only elements with the same name (of-type).
# * +reverse+ -- True to count from the last child (last-).
#
# The lambda takes an element and returns true or false. It is always
# false for an element whose parent is missing or is not a tag.
#
# NOTE(review): positions are tested per group of +a+ elements, which is
# not the same as the CSS <tt>an+b</tt> formula for every input (for
# example a = 2, b = 0 selects the 1st, 3rd, ... child) -- confirm this is
# intended before relying on it.
def nth_child(a, b, of_type, reverse)
  never = lambda { |element| false }
  # a = 0 means select at index b; with b = 0 nothing is selected.
  return never if a == 0 && b == 0
  # a < 0 and b < 0 can never match an index.
  return never if a < 0 && b < 0
  # b < 0 picks the last element of each group.
  b = a + b + 1 if b < 0
  # Switch to zero-based positions (b == 0 is treated like b == 1).
  b -= 1 unless b == 0
  lambda do |element|
    # Element must be inside a parent element.
    container = element.parent
    return false unless container && container.tag?
    # Siblings in counting order.
    siblings = container.children
    siblings = siblings.reverse if reverse
    # Only tags are counted (text nodes/comments are skipped), and for
    # of-type only tags with the element's own name.
    wanted = of_type ? element.name : nil
    peers = siblings.select do |child|
      child.tag? && (wanted.nil? || child.name == wanted)
    end
    if a == 0
      # Plain index: the element must be the one at position b.
      b >= 0 && peers[b].equal?(element)
    else
      position = peers.index { |child| child.equal?(element) }
      if position.nil?
        false
      elsif a < 0
        # Only the first b elements are candidates.
        position <= b && (position % a) == 0
      else
        (position % a) == b
      end
    end
  end
end
765
+
766
+
767
# Builds the lambda for the only-child pseudo classes. Pass +of_type+ as
# true to consider only sibling elements with the same name as the
# element being tested.
#
# The lambda takes an element and returns true if its parent holds no
# other such element. It is always false for an element whose parent is
# missing or is not a tag.
def only_child(of_type)
  lambda do |element|
    # Element must be inside a parent element.
    container = element.parent
    return false unless container && container.tag?
    wanted = of_type ? element.name : nil
    # Text nodes/comments do not count; any other tag (of the wanted
    # name, if one is required) means the element is not alone.
    container.children.none? do |child|
      child.tag? && (wanted.nil? || child.name == wanted) && !child.equal?(element)
    end
  end
end
787
+
788
+
789
# Creates a dependent selector (sibling, child, descendant) from the
# remainder of the statement and the array of substitution values, and
# returns it.
#
# Shared by every combinator branch of the constructor. The only logic
# here deals with comma separators: alternates collected by the new
# selector are taken away from it and added to this selector's own
# alternates.
def next_selector(statement, values)
  dependent = Selector.new(statement, values)
  hoisted = dependent.instance_variable_get(:@alternates)
  if hoisted
    dependent.instance_variable_set(:@alternates, nil)
    (@alternates ||= []).concat hoisted
  end
  dependent
end
806
+
807
+ end
808
+
809
+
810
# Shortcut for creating a selector: passes the statement and the
# substitution values straight to Selector.new and returns the result.
def self.selector(statement, *values)
  Selector.new(statement, *values)
end
814
+
815
+
816
class Tag

  # Builds a selector from the CSS expression and the substitution values,
  # and returns the array of elements it selects starting from this tag.
  def select(selector, *values)
    HTML::Selector.new(selector, values).select(self)
  end

end
824
+
825
+ end