scrapi 1.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,86 @@
1
module HTML

  # Sibling navigation, detaching and traversal helpers for nodes of a
  # parsed HTML tree. The methods rely on the node's +parent+ and
  # +children+ readers and on +tag?+ / +name+ of sibling nodes.
  class Node

    # Returns the node that directly follows this one among its parent's
    # children, or +nil+ if this is the last child or the node has no parent.
    def next_sibling
      siblings = parent && parent.children
      return nil unless siblings
      siblings.each_with_index do |node, i|
        return siblings[i + 1] if node.equal?(self)
      end
      nil
    end


    # Returns the node that directly precedes this one among its parent's
    # children, or +nil+ if this is the first child or the node has no parent.
    def previous_sibling
      siblings = parent && parent.children
      return nil unless siblings
      siblings.each_with_index do |node, i|
        next unless node.equal?(self)
        # A negative index would wrap around to the last sibling, so the
        # first child is handled explicitly.
        return i > 0 ? siblings[i - 1] : nil
      end
      nil
    end


    # Returns the next element after this one, skipping sibling nodes that
    # are not tags (e.g. text).
    #
    # With the +name+ argument, returns the next element with that name,
    # skipping other sibling elements. Returns +nil+ if there is no such
    # element or the node has no parent.
    def next_element(name = nil)
      siblings = parent && parent.children
      return nil unless siblings
      found = false
      siblings.each do |node|
        if node.equal?(self)
          found = true
        elsif found && node.tag?
          return node if name.nil? || node.name == name
        end
      end
      nil
    end


    # Returns the closest element before this one, skipping sibling nodes
    # that are not tags (e.g. text).
    #
    # With the +name+ argument, returns the closest preceding element with
    # that name, skipping other sibling elements. Returns +nil+ if there is
    # no such element or the node has no parent.
    def previous_element(name = nil)
      siblings = parent && parent.children
      return nil unless siblings
      found = nil
      siblings.each do |node|
        return found if node.equal?(self)
        # Remember the latest candidate seen before reaching this node.
        found = node if node.tag? && (name.nil? || node.name == name)
      end
      nil
    end


    # Detaches this node from its parent: removes it from the parent's
    # children and clears the parent reference. Returns the node itself.
    def detach
      if @parent
        @parent.children.delete_if { |child| child.equal?(self) }
        @parent = nil
      end
      self
    end


    # Visits this node and then, recursively, each of its children,
    # yielding the node and +value+ to the block. Returns +value+.
    def each(value = nil, &block)
      yield self, value
      if @children
        @children.each do |child|
          child.each value, &block
        end
      end
      value
    end

  end

end
@@ -0,0 +1,825 @@
1
+ # ScrAPI toolkit for Ruby
2
+ #
3
+ # Copyright (c) 2006 Assaf Arkin, under Creative Commons Attribution and/or MIT License
4
+ # Developed for http://co.mments.com
5
+ # Code and documentation: http://labnotes.org
6
+
7
+
8
+ module HTML
9
+
10
+
11
+ # Selects HTML elements using CSS 2 selectors.
12
+ #
13
+ # The +Selector+ class uses CSS selector expressions to match and select
14
+ # HTML elements.
15
+ #
16
+ # For example:
17
+ # selector = HTML::Selector.new "form.login[action=/login]"
18
+ # creates a new selector that matches any +form+ element with the class
19
+ # +login+ and an attribute +action+ with the value <tt>/login</tt>.
20
+ #
21
+ # === Matching Elements
22
+ #
23
+ # Use the #match method to determine if an element matches the selector.
24
+ #
25
+ # For simple selectors, the method returns an array with that element,
26
+ # or +nil+ if the element does not match. For complex selectors (see below)
27
+ # the method returns an array with all matched elements, or +nil+ if no
28
+ # match found.
29
+ #
30
+ # For example:
31
+ # if selector.match(element)
32
+ # puts "Element is a login form"
33
+ # end
34
+ #
35
+ # === Selecting Elements
36
+ #
37
+ # Use the #select method to select all matching elements starting with
38
+ # one element and going through all children in depth-first order.
39
+ #
40
+ # This method returns an array of all matching elements, an empty array
41
+ # if no match is found
42
+ #
43
+ # For example:
44
+ # selector = HTML::Selector.new "input[type=text]"
45
+ # matches = selector.select(element)
46
+ # matches.each do |match|
47
+ # puts "Found text field with name #{match.attributes['name']}"
48
+ # end
49
+ #
50
+ # === Expressions
51
+ #
52
+ # Selectors can match elements using any of the following criteria:
53
+ # * <tt>name</tt> -- Match an element based on its name (tag name).
54
+ # For example, <tt>p</tt> to match a paragraph. You can use <tt>*</tt>
55
+ # to match any element.
56
+ # * <tt>#</tt><tt>id</tt> -- Match an element based on its identifier (the
57
+ # <tt>id</tt> attribute). For example, <tt>#</tt><tt>page</tt>.
58
+ # * <tt>.class</tt> -- Match an element based on its class name, all
59
+ # class names if more than one specified.
60
+ # * <tt>[attr]</tt> -- Match an element that has the specified attribute.
61
+ # * <tt>[attr=value]</tt> -- Match an element that has the specified
62
+ # attribute and value. (More operators are supported see below)
63
+ # * <tt>:pseudo-class</tt> -- Match an element based on a pseudo class,
64
+ # such as <tt>:nth-child</tt> and <tt>:empty</tt>.
65
+ # * <tt>:not(expr)</tt> -- Match an element that does not match the
66
+ # negation expression.
67
+ #
68
+ # When using a combination of the above, the element name comes first
69
+ # followed by identifier, class names, attributes, pseudo classes and
70
+ # negation in any order. Do not separate these parts with spaces!
71
+ # Space separation is used for descendant selectors.
72
+ #
73
+ # For example:
74
+ # selector = HTML::Selector.new "form.login[action=/login]"
75
+ # The matched element must be of type +form+ and have the class +login+.
76
+ # It may have other classes, but the class +login+ is required to match.
77
+ # It must also have an attribute called +action+ with the value
78
+ # <tt>/login</tt>.
79
+ #
80
+ # This selector will match the following element:
81
+ # <form class="login form" method="post" action="/login">
82
+ # but will not match the element:
83
+ # <form method="post" action="/logout">
84
+ #
85
+ # === Attribute Values
86
+ #
87
+ # Several operators are supported for matching attributes:
88
+ # * <tt>name</tt> -- The element must have an attribute with that name.
89
+ # * <tt>name=value</tt> -- The element must have an attribute with that
90
+ # name and value.
91
+ # * <tt>name^=value</tt> -- The attribute value must start with the
92
+ # specified value.
93
+ # * <tt>name$=value</tt> -- The attribute value must end with the
94
+ # specified value.
95
+ # * <tt>name*=value</tt> -- The attribute value must contain the
96
+ # specified value.
97
+ # * <tt>name~=word</tt> -- The attribute value must contain the specified
98
+ # word (space separated).
99
+ # * <tt>name|=word</tt> -- The attribute value must start with specified
100
+ # word.
101
+ #
102
+ # For example, the following two selectors match the same element:
103
+ # #my_id
104
+ # [id=my_id]
105
+ # and so do the following two selectors:
106
+ # .my_class
107
+ # [class~=my_class]
108
+ #
109
+ # === Alternatives, siblings, children
110
+ #
111
+ # Complex selectors use a combination of expressions to match elements:
112
+ # * <tt>expr1 expr2</tt> -- Match any element against the second expression
113
+ # if it has some parent element that matches the first expression.
114
+ # * <tt>expr1 > expr2</tt> -- Match any element against the second expression
115
+ # if it is the child of an element that matches the first expression.
116
+ # * <tt>expr1 + expr2</tt> -- Match any element against the second expression
117
+ # if it immediately follows an element that matches the first expression.
118
+ # * <tt>expr1 ~ expr2</tt> -- Match any element against the second expression
119
+ # that comes after an element that matches the first expression.
120
+ # * <tt>expr1, expr2</tt> -- Match any element against the first expression,
121
+ # or against the second expression.
122
+ #
123
+ # Since children and sibling selectors may match more than one element given
124
+ # the first element, the #match method may return more than one match.
125
+ #
126
+ # === Pseudo classes
127
+ #
128
+ # Pseudo classes were introduced in CSS 3. They are most often used to select
129
+ # elements in a given position:
130
+ # * <tt>:root</tt> -- Match the element only if it is the root element
131
+ # (no parent element).
132
+ # * <tt>:empty</tt> -- Match the element only if it has no child elements,
133
+ # and no text content.
134
+ # * <tt>:only-child</tt> -- Match the element if it is the only child (element)
135
+ # of its parent element.
136
+ # * <tt>:only-of-type</tt> -- Match the element if it is the only child (element)
137
+ # of its parent element and its type.
138
+ # * <tt>:first-child</tt> -- Match the element if it is the first child (element)
139
+ # of its parent element.
140
+ # * <tt>:first-of-type</tt> -- Match the element if it is the first child (element)
141
+ # of its parent element of its type.
142
+ # * <tt>:last-child</tt> -- Match the element if it is the last child (element)
143
+ # of its parent element.
144
+ # * <tt>:last-of-type</tt> -- Match the element if it is the last child (element)
145
+ # of its parent element of its type.
146
+ # * <tt>:nth-child(b)</tt> -- Match the element if it is the b-th child (element)
147
+ # of its parent element. The value <tt>b</tt> specifies its index, starting with 1.
148
+ # * <tt>:nth-child(an+b)</tt> -- Match the element if it is the b-th child (element)
149
+ # in each group of <tt>a</tt> child elements of its parent element.
150
+ # * <tt>:nth-child(-an+b)</tt> -- Match the element if it is the first child (element)
151
+ # in each group of <tt>a</tt> child elements, up to the first <tt>b</tt> child
152
+ # elements of its parent element.
153
+ # * <tt>:nth-child(odd)</tt> -- Match element in the odd position (i.e. first, third).
154
+ # Same as <tt>:nth-child(2n+1)</tt>.
155
+ # * <tt>:nth-child(even)</tt> -- Match element in the even position (i.e. second,
156
+ # fourth). Same as <tt>:nth-child(2n+2)</tt>.
157
+ # * <tt>:nth-of-type(..)</tt> -- As above, but only counts elements of its type.
158
+ # * <tt>:nth-last-child(..)</tt> -- As above, but counts from the last child.
159
+ # * <tt>:nth-last-of-type(..)</tt> -- As above, but counts from the last child and
160
+ # only elements of its type.
161
+ # * <tt>:not(selector)</tt> -- Match the element only if the element does not
162
+ # match the simple selector.
163
+ #
164
+ # As you can see, the <tt>:nth-child</tt> pseudo class and its variants can get quite
165
+ # tricky and the CSS specification doesn't do a much better job explaining it.
166
+ # But after reading the examples and trying a few combinations, it's easy to
167
+ # figure out.
168
+ #
169
+ # For example:
170
+ # table tr:nth-child(odd)
171
+ # Selects every second row in the table starting with the first one.
172
+ #
173
+ # div p:nth-child(4)
174
+ # Selects the fourth paragraph in the +div+, but not if the +div+ contains
175
+ # other elements, since those are also counted.
176
+ #
177
+ # div p:nth-of-type(4)
178
+ # Selects the fourth paragraph in the +div+, counting only paragraphs, and
179
+ # ignoring all other elements.
180
+ #
181
+ # div p:nth-of-type(-n+4)
182
+ # Selects the first four paragraphs, ignoring all others.
183
+ #
184
+ # And you can always select an element that matches one set of rules but
185
+ # not another using <tt>:not</tt>. For example:
186
+ # p:not(.post)
187
+ # Matches all paragraphs that do not have the class <tt>.post</tt>.
188
+ #
189
+ # === Substitution Values
190
+ #
191
+ # You can use substitution with identifiers, class names and element values.
192
+ # A substitution takes the form of a question mark (<tt>?</tt>) and uses the
193
+ # next value in the argument list following the CSS expression.
194
+ #
195
+ # The substitution value may be a string or a regular expression. All other
196
+ # values are converted to strings.
197
+ #
198
+ # For example:
199
+ # selector = HTML::Selector.new "#?", /^\d+$/
200
+ # matches any element whose identifier consists of one or more digits.
201
+ #
202
+ # See http://www.w3.org/TR/css3-selectors/
203
+ class Selector
204
+
205
+
206
# Error type for an invalid selector.
class InvalidSelectorError < StandardError
end
208
+
209
+
210
class << self

  # :call-seq:
  #   Selector.for_class(cls) => selector
  #
  # Creates a new selector for the given class name.
  #
  # The class name is passed as a substitution value for the attribute
  # form <tt>[class~=?]</tt>, which matches +cls+ as one of the
  # space-separated words of the +class+ attribute.
  def for_class(cls)
    # The expression and its substitution value are separate arguments;
    # wrapping both in one Array would make the Array the expression.
    new("[class~=?]", cls)
  end


  # :call-seq:
  #   Selector.for_id(id) => selector
  #
  # Creates a new selector for the given id. The id is passed as the
  # substitution value of the expression <tt>#?</tt>.
  def for_id(id)
    new("#?", id)
  end

end
230
+
231
+
232
# :call-seq:
#   Selector.new(string, [values ...]) => selector
#
# Creates a new selector from a CSS selector expression.
#
# The first argument is the selector expression. All other arguments
# are used for value substitution; a single Array argument is taken as
# the list of substitution values itself.
#
# Raises ArgumentError if the expression is empty, or if some part of it
# is left over after parsing.
def initialize(selector, *values)
  raise ArgumentError, "CSS expression cannot be empty" if selector.empty?
  @source = ""
  values = values.first if values.size == 1 && values.first.is_a?(Array)
  # Parse a private copy: the parser consumes the statement as it goes,
  # and the caller's string must stay untouched.
  statement = selector.strip.dup
  # Parse the leading simple selector (including negation); each key of
  # the returned hash becomes an instance variable of the same name.
  simple_selector(statement, values).each do |key, parsed|
    instance_variable_set("@#{key}", parsed)
  end

  if statement.sub!(/^\s*,\s*/, "")
    # Group (expr1, expr2): the rest is an alternative selector.
    second = Selector.new(statement, values)
    (@alternates ||= []) << second
    # Alternatives collected by the second selector are moved up so that
    # the top selector holds the whole group.
    if alternates = second.instance_variable_get(:@alternates)
      second.instance_variable_set(:@alternates, nil)
      @alternates.concat alternates
    end
    @source << " , " << second.to_s
  elsif statement.sub!(/^\s*\+\s*/, "")
    # expr1 + expr2: the second selector is applied to the element
    # immediately following this one.
    second = next_selector(statement, values)
    @depends = lambda do |element, first|
      follower = next_element(element)
      follower ? second.match(follower, first) : nil
    end
    @source << " + " << second.to_s
  elsif statement.sub!(/^\s*~\s*/, "")
    # expr1 ~ expr2: the second selector is applied to every element
    # following this one.
    second = next_selector(statement, values)
    @depends = lambda do |element, first|
      collected = []
      sibling = element
      while (sibling = next_element(sibling))
        subset = second.match(sibling, first)
        next unless subset
        if first && !subset.empty?
          # Only the first match was asked for.
          collected << subset.first
          break
        end
        collected.concat(subset)
      end
      collected.empty? ? nil : collected
    end
    @source << " ~ " << second.to_s
  elsif statement.sub!(/^\s*>\s*/, "")
    # expr1 > expr2: the second selector is applied to the child
    # elements of this one.
    second = next_selector(statement, values)
    @depends = lambda do |element, first|
      collected = []
      element.children.each do |child|
        next unless child.tag?
        subset = second.match(child, first)
        next unless subset
        if first && !subset.empty?
          collected << subset.first
          break
        end
        collected.concat(subset)
      end
      collected.empty? ? nil : collected
    end
    @source << " > " << second.to_s
  elsif statement =~ /^\s+\S+/ && statement != selector
    # expr1 expr2: the second selector is applied to the descendants of
    # this element, depth-first. The children of a descendant that
    # itself matches are not visited.
    # NOTE(review): the `statement != selector` guard presumably prevents
    # recursing when nothing was consumed from the expression -- confirm.
    second = next_selector(statement, values)
    @depends = lambda do |element, first|
      collected = []
      pending = element.children.reverse
      while (node = pending.pop)
        next unless node.tag?
        subset = second.match(node, first)
        if subset
          if first && !subset.empty?
            collected << subset.first
            break
          end
          collected.concat(subset)
        elsif (kids = node.children)
          # Reversed so that popping visits children in document order.
          pending.concat(kids.reverse)
        end
      end
      collected.empty? ? nil : collected
    end
    @source << " " << second.to_s
  else
    # Last selector in the chain: anything still left in the statement
    # could not be parsed.
    raise ArgumentError, "Invalid selector: #{statement}" unless statement.strip.empty?
  end
end
340
+
341
+
342
# :call-seq:
#   match(element, first?) => array or nil
#
# Tests an element against the selector.
#
# For a simple selector the result is an array holding the element when
# it matches, nil otherwise.
#
# For a complex selector (sibling, child, descendant) the result is
# whatever the dependent selector matched starting from this element,
# nil if nothing matched. Matches of alternative selectors in the same
# group are appended.
#
# Pass +first_only=true+ if you are only interested in the first element.
#
# For example:
#   if selector.match(element)
#     puts "Element is a login form"
#   end
def match(element, first_only = false)
  # A selector without a tag name accepts any element name.
  matched = !@tag_name || @tag_name == element.name

  # Every attribute pattern must match the element's attribute value.
  matched &&= @attributes.all? { |name, pattern| element.attributes[name] =~ pattern }

  # Every pseudo class (nth-child, empty, etc) must accept the element.
  matched &&= @pseudo.all? { |check| check.call(element) }

  # Negation: the element is rejected as soon as one negated selector
  # matches its name, one of its attributes or one of its pseudo classes.
  if matched && @negation
    rejected = @negation.any? do |negated|
      negated[:tag_name] == element.name ||
        negated[:attributes].any? { |name, pattern| element.attributes[name] =~ pattern } ||
        negated[:pseudo].any? { |check| check.call(element) }
    end
    matched = !rejected
  end

  # A matching element that has a dependent selector (child, sibling,
  # etc) yields the dependent matches instead of itself.
  matches =
    if matched && @depends
      @depends.call(element, first_only)
    elsif matched
      [element]
    end

  # Try the alternative selectors of the group, unless only the first
  # match is wanted and there already is one.
  if @alternates && (!first_only || !matches)
    @alternates.each do |alternate|
      break if matches && first_only
      subset = alternate.match(element, first_only)
      next unless subset
      if matches
        matches.concat(subset)
      else
        matches = subset
      end
    end
  end

  matches
end
432
+
433
+
434
# :call-seq:
#   select(root) => array
#
# Selects and returns an array with all matching elements, beginning
# with one node and traversing through all children depth-first.
# Returns an empty array if no match is found.
#
# The root node may be any element in the document, or the document
# itself.
#
# The traversal continues into the children of elements that matched,
# so nested matches (for example a +div+ inside a +div+) are found too.
# Each element appears at most once in the result.
#
# For example:
#   selector = HTML::Selector.new "input[type=text]"
#   matches = selector.select(element)
#   matches.each do |match|
#     puts "Found text field with name #{match.attributes['name']}"
#   end
def select(root)
  matches = []
  stack = [root]
  while (node = stack.pop)
    if node.tag? && (subset = match(node, false))
      subset.each do |candidate|
        # Complex selectors can reach the same element from several
        # starting points; keep each element once (identity comparison).
        matches << candidate unless matches.any? { |item| item.equal?(candidate) }
      end
    end
    # Always descend, whether or not this node matched, so that matches
    # nested inside a matching element are not lost. Children are
    # reversed so that popping visits them in document order.
    if (children = node.children)
      stack.concat children.reverse
    end
  end
  matches
end
464
+
465
+
466
# Like #select, but stops at the first element that matches and returns
# it. Returns +nil+ if no element matches the selector.
def select_first(root)
  pending = [root]
  while (node = pending.pop)
    subset = node.tag? && match(node, true)
    if subset
      return subset.first unless subset.empty?
    elsif (kids = node.children)
      # Reversed so that popping visits children in document order.
      pending.concat(kids.reverse)
    end
  end
  nil
end
479
+
480
+
481
# Returns the selector's source string (@source), which is built up
# while the expression is parsed.
def to_s #:nodoc:
  return @source
end
484
+
485
+
486
# Returns the next element after +element+, skipping sibling nodes that
# are not tags (e.g. text).
#
# With the +name+ argument, returns the next element with that name,
# skipping other sibling elements.
#
# Returns +nil+ if there is no such element, or if +element+ has no
# parent (and therefore no siblings).
def next_element(element, name = nil)
  parent = element.parent
  siblings = parent && parent.children
  return nil unless siblings
  found = false
  siblings.each do |node|
    if node.equal?(element)
      found = true
    elsif found && node.tag?
      return node if name.nil? || node.name == name
    end
  end
  nil
end
503
+
504
+
505
+ protected
506
+
507
+
508
# Creates a simple selector given the statement and array of
# substitution values.
#
# The parts that are recognized are removed from the front of
# +statement+ (it is modified in place) and appended, in normalized
# form, to @source. Substitution values are taken from the front of
# +values+ each time a <tt>?</tt> is encountered.
#
# Returns a hash with the values +tag_name+, +attributes+,
# +pseudo+ (classes) and +negation+.
#
# Called the first time with +can_negate+ true to allow
# negation. Called a second time with false since negation
# cannot be negated.
#
# Raises ArgumentError for a nested or unclosed <tt>:not(</tt> and for
# an nth-child expression that cannot be interpreted.
def simple_selector(statement, values, can_negate = true)
  tag_name = nil
  attributes = []
  pseudo = []
  negation = []

  # Element name. (Note that in negation, this can come at
  # any order, but for simplicity we allow if only first).
  statement.sub!(/^(\*|[[:alpha:]][\w\-]*)/) do |match|
    match.strip!
    tag_name = match.downcase unless match == "*"
    @source << match
    "" # Remove
  end

  # Get identifier, class, attribute name, pseudo or negation.
  while true
    # Element identifier.
    next if statement.sub!(/^#(\?|[\w\-]+)/) do |match|
      id = $1
      if id == "?"
        id = values.shift
      end
      @source << "##{id}"
      id = Regexp.new("^#{Regexp.escape(id.to_s)}$") unless id.is_a?(Regexp)
      attributes << ["id", id]
      "" # Remove
    end

    # Class name, or ? to take the class name from the substitution values.
    next if statement.sub!(/^\.(\?|[\w\-]+)/) do |match|
      class_name = $1
      if class_name == "?"
        class_name = values.shift
      end
      @source << ".#{class_name}"
      # "\\s" keeps the backslash, so the pattern uses the regexp
      # whitespace class; a bare "\s" in a double-quoted string is just
      # a space and would miss tab/newline separated class names.
      class_name = Regexp.new("(^|\\s)#{Regexp.escape(class_name.to_s)}($|\\s)") unless class_name.is_a?(Regexp)
      attributes << ["class", class_name]
      "" # Remove
    end

    # Attribute value. The value may be single quoted, double quoted or
    # bare; a quoted value may contain a closing bracket.
    next if statement.sub!(/^\[\s*([[:alpha:]][\w\-]*)\s*((?:[~|^$*])?=)?\s*('[^']*'|"[^"]*"|[^\]]*)\s*\]/) do |match|
      name, equality, value = $1, $2, $3
      if value == "?"
        value = values.shift
      else
        # Handle single and double quotes.
        value.strip!
        if (value[0] == ?" or value[0] == ?') and value[0] == value[-1]
          value = value[1..-2]
        end
      end
      @source << "[#{name}#{equality}'#{value}']"
      attributes << [name.downcase.strip, attribute_match(equality, value)]
      "" # Remove
    end

    # Root element only.
    next if statement.sub!(/^:root/) do |match|
      pseudo << lambda do |element|
        element.parent.nil? or not element.parent.tag?
      end
      @source << ":root"
      "" # Remove
    end

    # Nth-child including last and of-type.
    next if statement.sub!(/^:nth-(last-)?(child|of-type)\((odd|even|(\d+|\?)|(-?\d*|\?)?n([+\-]\d+|\?)?)\)/) do |match|
      # Capture these before the case below: matching against the
      # regular expressions in its when clauses resets $1 and $2.
      reverse = $1 == "last-"
      of_type = $2 == "of-type"
      @source << ":nth-#{$1}#{$2}("
      case $3
      when "odd"
        pseudo << nth_child(2, 1, of_type, reverse)
        @source << "odd)"
      when "even"
        pseudo << nth_child(2, 2, of_type, reverse)
        @source << "even)"
      when /^(\d+|\?)$/  # b only
        b = ($1 == "?" ? values.shift : $1).to_i
        pseudo << nth_child(0, b, of_type, reverse)
        @source << "#{b})"
      when /^(-?\d*|\?)?n([+\-]\d+|\?)?$/
        a = ($1 == "?" ? values.shift :
             $1 == "" ? 1 : $1 == "-" ? -1 : $1).to_i
        b = ($2 == "?" ? values.shift : $2).to_i
        pseudo << nth_child(a, b, of_type, reverse)
        @source << (b >= 0 ? "#{a}n+#{b})" : "#{a}n#{b})")
      else
        raise ArgumentError, "Invalid nth-child #{match}"
      end
      "" # Remove
    end
    # First/last child (of type).
    next if statement.sub!(/^:(first|last)-(child|of-type)/) do |match|
      reverse = $1 == "last"
      of_type = $2 == "of-type"
      pseudo << nth_child(0, 1, of_type, reverse)
      @source << ":#{$1}-#{$2}"
      "" # Remove
    end
    # Only child (of type).
    next if statement.sub!(/^:only-(child|of-type)/) do |match|
      of_type = $1 == "of-type"
      pseudo << only_child(of_type)
      @source << ":only-#{$1}"
      "" # Remove
    end

    # Empty: no child elements or meaningful content (whitespaces
    # are ignored).
    next if statement.sub!(/^:empty/) do |match|
      pseudo << lambda do |element|
        empty = true
        for child in element.children
          if child.tag? or !child.content.strip.empty?
            empty = false
            break
          end
        end
        empty
      end
      @source << ":empty"
      "" # Remove
    end
    # Content: match the text content of the element, stripping
    # leading and trailing spaces. Only the element's own non-tag
    # children contribute to the text.
    next if statement.sub!(/^:content\(\s*(\?|'[^']*'|"[^"]*"|[^)]*)\s*\)/) do |match|
      content = $1
      if content == "?"
        content = values.shift
      elsif (content[0] == ?" or content[0] == ?') and content[0] == content[-1]
        content = content[1..-2]
      end
      @source << ":content('#{content}')"
      content = Regexp.new("^#{Regexp.escape(content.to_s)}$") unless content.is_a?(Regexp)
      pseudo << lambda do |element|
        text = ""
        for child in element.children
          unless child.tag?
            text << child.content
          end
        end
        text.strip =~ content
      end
      "" # Remove
    end

    # Negation. Create another simple selector to handle it.
    if statement.sub!(/^:not\(\s*/, "")
      raise ArgumentError, "Double negatives are not missing feature" unless can_negate
      @source << ":not("
      negation << simple_selector(statement, values, false)
      raise ArgumentError, "Negation not closed" unless statement.sub!(/^\s*\)/, "")
      @source << ")"
      next
    end

    # No match: moving on.
    break
  end

  # Return hash. The keys are mapped to instance variables.
  {:tag_name=>tag_name, :attributes=>attributes, :pseudo=>pseudo, :negation=>negation}
end
680
+
681
+
682
# Create a regular expression to match an attribute value based
# on the equality operator (=, ^=, |=, etc).
#
# +value+ may be a Regexp, which is embedded as is, or any other object,
# which is converted to a string and escaped. Without an operator the
# value must be empty and the result matches any attribute value
# (existence check); otherwise InvalidSelectorError is raised.
def attribute_match(equality, value)
  regexp = value.is_a?(Regexp) ? value : Regexp.escape(value.to_s)
  # Note: whitespace is written "\\s" below. A bare "\s" inside a
  # double-quoted string is a literal space, not the regexp whitespace
  # class, so tab/newline separated words would not be recognized.
  case equality
  when "=" then
    # Match the attribute value in full
    Regexp.new("^#{regexp}$")
  when "~=" then
    # Match a space-separated word within the attribute value
    Regexp.new("(^|\\s)#{regexp}($|\\s)")
  when "^="
    # Match the beginning of the attribute value
    Regexp.new("^#{regexp}")
  when "$="
    # Match the end of the attribute value
    Regexp.new("#{regexp}$")
  when "*="
    # Match substring of the attribute value
    regexp.is_a?(Regexp) ? regexp : Regexp.new(regexp)
  when "|=" then
    # Match the first space-separated item of the attribute value
    Regexp.new("^#{regexp}($|\\s)")
  else
    # to_s so that a nil or Regexp substitution value is reported as an
    # invalid selector (or treated as empty) instead of failing on empty?.
    raise InvalidSelectorError, "Invalid operation/value" unless value.to_s.empty?
    # Match all attributes values (existence check)
    //
  end
end
711
+
712
+
713
# Returns a lambda that can match an element against the nth-child
# pseudo class, given the following arguments:
# * +a+ -- Value of a part.
# * +b+ -- Value of b part.
# * +of_type+ -- True to test only elements of this type (of-type).
# * +reverse+ -- True to count in reverse order (last-).
#
# The lambda takes an element and returns true or false. Only sibling
# tags are counted (and only those with the element's name when
# +of_type+ is set). An element whose parent is missing or is not a tag
# never matches.
#
# For a positive +a+ the element matches at the b-th position and at
# every a-th position after it, also when +b+ is larger than +a+
# (e.g. <tt>n+3</tt> matches the third child and all that follow).
def nth_child(a, b, of_type, reverse)
  # a = 0 means select at index b, if b = 0 nothing selected
  return lambda { |element| false } if a == 0 && b == 0
  # a < 0 and b < 0 will never match against an index
  return lambda { |element| false } if a < 0 && b < 0
  b = a + b + 1 if b < 0   # b < 0 just picks last element from each group
  b -= 1 unless b == 0  # b == 0 is same as b == 1, otherwise zero based
  lambda do |element|
    # Element must be inside parent element.
    return false unless element.parent && element.parent.tag?
    index = 0
    # Get siblings, reverse if counting from last.
    siblings = element.parent.children
    siblings = siblings.reverse if reverse
    # Match element name if of-type, otherwise ignore name.
    name = of_type ? element.name : nil
    found = false
    siblings.each do |child|
      # Skip text nodes/comments.
      next unless child.tag? && (name.nil? || child.name == name)
      if a == 0
        # Shortcut when a == 0 no need to go past count
        if index == b
          found = child.equal?(element)
          break
        end
      elsif a < 0
        # Only look for first b elements
        break if index > b
        if child.equal?(element)
          found = (index % a) == 0
          break
        end
      elsif child.equal?(element)
        # Match at zero-based offset b and every a-th position after it.
        # Comparing index % a with b directly can never succeed once
        # b >= a. An offset that is still negative here never matches.
        found = b >= 0 && index >= b && ((index - b) % a) == 0
        break
      end
      index += 1
    end
    found
  end
end
765
+
766
+
767
# Builds the lambda for the only-child pseudo class. With +of_type+ set,
# only siblings that have the element's own name are considered.
#
# The lambda returns true when the parent has no other matching child
# tag besides the element itself, and false when there is one, or when
# the element's parent is missing or is not a tag.
def only_child(of_type)
  lambda do |element|
    container = element.parent
    # Element must be inside parent element.
    return false unless container && container.tag?
    wanted = of_type ? element.name : nil
    # Look for any other tag (text nodes/comments do not count).
    rivals = container.children.any? do |sibling|
      sibling.tag? && (wanted.nil? || sibling.name == wanted) && !sibling.equal?(element)
    end
    !rivals
  end
end
787
+
788
+
789
# Creates the dependent selector (sibling, child, descendant) from the
# remainder of the statement and the array of substitution values, and
# returns it.
#
# Shared by the places that create a dependent selector. The only logic
# here deals with comma separators: alternative selectors collected by
# the dependent selector are taken away from it and appended to this
# selector's own group.
def next_selector(statement, values)
  dependent = Selector.new(statement, values)
  hoisted = dependent.instance_variable_get(:@alternates)
  if hoisted
    dependent.instance_variable_set(:@alternates, nil)
    @alternates ||= []
    @alternates.concat(hoisted)
  end
  dependent
end
806
+
807
+ end
808
+
809
+
810
# Shortcut for creating a selector: passes the statement and the
# substitution values on to Selector.new and returns the new selector.
def self.selector(statement, *values)
  arguments = [statement, *values]
  Selector.new(*arguments)
end
814
+
815
+
816
class Tag

  # Builds an HTML::Selector from the expression and the substitution
  # values (handed over as one array) and returns what it selects,
  # starting from this tag.
  def select(selector, *values)
    HTML::Selector.new(selector, values).select(self)
  end

end
824
+
825
+ end