assaf-scrapi 1.2.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,534 @@
1
+ require 'strscan'
2
+
3
+ module HTML #:nodoc:
4
+
5
# Normalized search conditions, stored as a Hash with symbol keys.
#
# The constructor accepts either a Hash of conditions or any other object;
# a non-Hash argument is treated as the value of a <tt>:content</tt>
# condition. Top-level keys are converted to symbols, the keys of
# <tt>:attributes</tt> are converted to strings, and nested conditions
# (<tt>:parent</tt>, <tt>:child</tt>, ...) are wrapped in Conditions
# recursively.
class Conditions < Hash #:nodoc:
  # Builds the normalized conditions from +hash+.
  #
  # Raises a RuntimeError ("illegal key ...") for an unknown key, for a key
  # that cannot be converted to a symbol, or when <tt>:attributes</tt> /
  # <tt>:children</tt> is given a value that is not hash-like.
  def initialize(hash)
    super()
    hash = { :content => hash } unless Hash === hash

    # Entries are written straight into self, so the input hash is never
    # modified while it is being walked.
    keys_to_symbols(hash).each do |key, value|
      self[key] =
        case key
        when :tag, :content
          # keys are valid, and require no further processing
          value
        when :attributes
          keys_to_strings(hash_like!(key, value))
        when :parent, :child, :ancestor, :descendant, :sibling, :before, :after
          Conditions.new(value)
        when :children
          normalize_children(hash_like!(key, value))
        else
          raise "illegal key #{key.inspect} => #{value.inspect}"
        end
    end
  end

  private

  # Normalizes the options hash given for <tt>:children</tt>: symbol keys,
  # and the <tt>:only</tt> value wrapped in Conditions.
  def normalize_children(options)
    normalized = {}
    keys_to_symbols(options).each do |option, setting|
      normalized[option] =
        case option
        when :count, :greater_than, :less_than
          # keys are valid, and require no further processing
          setting
        when :only
          Conditions.new(setting)
        else
          raise "illegal key #{option.inspect} => #{setting.inspect}"
        end
    end
    normalized
  end

  # Returns +value+ if it can be treated as a hash; otherwise raises the same
  # "illegal key" error used for other malformed entries, instead of letting
  # a NoMethodError escape from the key conversion helpers.
  def hash_like!(key, value)
    return value if value.respond_to?(:keys)
    raise "illegal key #{key.inspect} => #{value.inspect}"
  end

  # Returns a new Hash with every key of +hash+ converted with +to_s+.
  def keys_to_strings(hash)
    converted = {}
    hash.keys.each { |key| converted[key.to_s] = hash[key] }
    converted
  end

  # Returns a new Hash with every key of +hash+ converted with +to_sym+.
  # Raises if a key does not respond to +to_sym+.
  def keys_to_symbols(hash)
    converted = {}
    hash.keys.each do |key|
      raise "illegal key #{key.inspect}" unless key.respond_to?(:to_sym)
      converted[key.to_sym] = hash[key]
    end
    converted
  end
end
55
+
56
+ # The base class of all nodes, textual and otherwise, in an HTML document.
57
# Generic tree node. Holds a parent reference, a list of children and the
# line/position at which the node began, and provides the default search and
# comparison behavior that subclasses refine. Node.parse turns one piece of
# markup or text into a Text, CDATA or Tag node.
class Node #:nodoc:
  # The array of children of this node. Not all nodes have children.
  attr_reader :children

  # The parent node of this node. All nodes have a parent, except for the
  # root node.
  attr_reader :parent

  # The line number of the input where this node was begun
  attr_reader :line

  # The byte position in the input where this node was begun
  attr_reader :position

  # Create a new node as a child of the given parent. The node is not added
  # to the parent's children here.
  def initialize(parent, line=0, pos=0)
    @parent = parent
    @children = []
    @line, @position = line, pos
  end

  # Return a textual representation of the node: the concatenation of the
  # textual representation of each child.
  def to_s
    @children.map { |child| child.to_s }.join
  end

  # Return false (subclasses must override this to provide specific matching
  # behavior.) +conditions+ may be of any type.
  def match(conditions)
    false
  end

  # Search the children of this node for the first node for which #find
  # returns non +nil+. Returns the result of the #find call that succeeded,
  # or +nil+ if no child produced a result.
  def find(conditions)
    conditions = validate_conditions(conditions)

    @children.each do |child|
      node = child.find(conditions)
      return node if node
    end
    nil
  end

  # Search for all nodes that match the given conditions, and return them
  # as an array. The node itself is included when it matches.
  def find_all(conditions)
    conditions = validate_conditions(conditions)

    matches = []
    matches << self if match(conditions)
    @children.each do |child|
      matches.concat child.find_all(conditions)
    end
    matches
  end

  # Returns +false+. Subclasses may override this if they define a kind of
  # tag.
  def tag?
    false
  end

  # Returns +conditions+ unchanged when it already is a Conditions object,
  # otherwise wraps it in a new Conditions.
  def validate_conditions(conditions)
    Conditions === conditions ? conditions : Conditions.new(conditions)
  end

  # Two nodes are equal when they are of the same class and have the same
  # number of children, each pair of which is equal in turn.
  def ==(node)
    return false unless self.class == node.class && children.size == node.children.size

    children.zip(node.children).all? { |mine, theirs| mine == theirs }
  end

  class <<self
    # Builds a node from +content+, a single piece of text or markup.
    #
    # * content that does not look like markup becomes a Text node
    # * <tt><![CDATA[...]]></tt> becomes a CDATA node holding the text
    #   between the delimiters
    # * anything else is parsed as an opening, closing or self-closing Tag
    #   with a downcased name and downcased attribute names
    #
    # With +strict+ set (the default) malformed markup raises a RuntimeError;
    # otherwise the parser recovers as well as it can.
    def parse(parent, line, pos, content, strict=true)
      return Text.new(parent, line, pos, content) if content !~ /^<\S/

      scanner = StringScanner.new(content)

      # The check above is line-anchored, so the "<" may not be the very
      # first character of the content.
      unless scanner.skip(/</)
        raise "expected <" if strict
        return Text.new(parent, line, pos, content)
      end

      if scanner.skip(/!\[CDATA\[/)
        return CDATA.new(parent, line, pos, scan_cdata_body(scanner, content, strict))
      end

      closing = scanner.scan(/\//) ? :close : nil
      name = scanner.scan(/[\w:]+/)
      return Text.new(parent, line, pos, content) unless name
      name = name.downcase

      # Closing tags carry no attributes; they keep +nil+ here.
      attributes = nil
      unless closing
        attributes = scan_attributes(scanner)
        closing = scanner.scan(/\//) ? :self : nil
      end

      unless scanner.scan(/\s*>/)
        if strict
          raise "expected > (got #{scanner.rest.inspect} for #{content}, #{attributes.inspect})"
        end
        # throw away all text until we find what we're looking for
        scanner.skip_until(/>/) || scanner.terminate
      end

      Tag.new(parent, line, pos, name, attributes, closing)
    end

    private

    # Returns the text between the CDATA opener (already consumed) and the
    # closing "]]>". StringScanner#pre_match is not used because it spans
    # from the start of the string and so would include the opener itself.
    # An unterminated section raises in strict mode; otherwise the rest of
    # the content is taken as the body.
    def scan_cdata_body(scanner, content, strict)
      scanned = scanner.scan_until(/\]\]>/)
      return scanned[0...-3] if scanned

      raise "expected ]]> (got #{scanner.rest.inspect} for #{content})" if strict

      body = scanner.rest
      scanner.terminate
      body
    end

    # Reads attributes up to the end of the tag and returns them as a hash
    # keyed by downcased name. An attribute without "=" has the value +true+.
    def scan_attributes(scanner)
      attributes = {}
      scanner.skip(/\s*/)
      while attr = scanner.scan(/[-\w:]+/)
        value = true
        if scanner.scan(/\s*=\s*/)
          delim = scanner.scan(/['"]/)
          value = delim ? scan_quoted_value(scanner, delim) : scanner.scan(/[^\s>\/]+/)
        end
        attributes[attr.downcase] = value
        scanner.skip(/\s*/)
      end
      attributes
    end

    # Reads a quoted attribute value up to the closing +delim+. A backslash
    # and the character after it are both kept verbatim, so an escaped
    # delimiter does not end the value.
    def scan_quoted_value(scanner, delim)
      value = "".dup
      while text = scanner.scan(/[^#{delim}\\]+|./)
        case text
        when "\\"
          value << text
          # getch returns nil when the backslash is the last character
          escaped = scanner.getch
          value << escaped if escaped
        when delim
          break
        else
          value << text
        end
      end
      value
    end
  end
end
205
+
206
+ # A node that represents text, rather than markup.
207
# A leaf node holding a run of text.
class Text < Node #:nodoc:

  # The text held by this node.
  attr_reader :content

  # Creates a new text node as a child of the given parent, with the given
  # content.
  def initialize(parent, line, pos, content)
    super(parent, line, pos)
    @content = content
  end

  # Returns the content of this node.
  def to_s
    @content
  end

  # Returns +self+ if this node meets the given conditions, otherwise the
  # false/+nil+ result of #match. Text nodes support conditions of the
  # following kinds:
  #
  # * if +conditions+ is a string, it must be a substring of the node's
  #   content
  # * if +conditions+ is a regular expression, it must match the node's
  #   content
  # * if +conditions+ is a hash, it must contain a <tt>:content</tt> key that
  #   is either a string or a regexp, and which is interpreted as described
  #   above.
  def find(conditions)
    outcome = match(conditions)
    outcome ? self : outcome
  end

  # Returns a true value if this node meets the given conditions, or
  # +nil+/+false+ otherwise. See the discussion of #find for the valid
  # conditions.
  def match(conditions)
    return @content.index(conditions) if String === conditions
    return @content =~ conditions if Regexp === conditions
    return nil unless Hash === conditions

    conditions = validate_conditions(conditions)

    # Text nodes only have :content, :parent, :ancestor. Any other key means
    # the node cannot match. Only :content is evaluated below; :parent and
    # :ancestor are accepted but not checked.
    unsupported = conditions.keys - [:content, :parent, :ancestor]
    return false unless unsupported.empty?

    match(conditions[:content])
  end

  # Text nodes are equal when they are equal as nodes and hold equal content.
  def ==(node)
    super && content == node.content
  end
end
264
+
265
+ # A CDATA node is simply a text node with a specialized way of displaying
266
+ # itself.
267
class CDATA < Text #:nodoc:
  # Returns the node's content wrapped in CDATA section delimiters. The
  # section is closed with "]]>", the terminator that Node.parse scans for.
  def to_s
    "<![CDATA[#{super}]]>"
  end
end
272
+
273
+ # A Tag is any node that represents markup. It may be an opening tag, a
274
+ # closing tag, or a self-closing tag. It has a name, and may have a hash of
275
+ # attributes.
276
# A markup node: an opening, closing or self-closing tag with a name and an
# optional hash of attributes.
class Tag < Node #:nodoc:

  # Either +nil+, <tt>:close</tt>, or <tt>:self</tt>
  attr_reader :closing

  # Either +nil+, or a hash of attributes for this node.
  attr_reader :attributes

  # The name of this tag.
  attr_reader :name

  # Create a new node as a child of the given parent, with the given name,
  # attributes hash (may be +nil+) and closing status.
  def initialize(parent, line, pos, name, attributes, closing)
    super(parent, line, pos)
    @name = name
    @attributes = attributes
    @closing = closing
  end

  # A convenience for obtaining an attribute of the node. Returns +nil+ if
  # the node has no attributes.
  def [](attr)
    @attributes ? @attributes[attr] : nil
  end

  # Returns a true value if this tag cannot contain child nodes: it is a
  # closing or self-closing tag, or its name is one of the HTML element names
  # listed below. When +xml+ is true, a tag without a closing marker is never
  # considered childless, whatever its name.
  def childless?(xml = false)
    return false if xml && @closing.nil?
    !@closing.nil? ||
      @name =~ /^(img|br|hr|link|meta|area|base|basefont|
                  col|frame|input|isindex|param)$/ox
  end

  # Returns a textual representation of the node
  def to_s
    return "</#{@name}>" if @closing == :close

    s = "<#{@name}"
    # @attributes may be nil (see #[]), in which case nothing is emitted.
    (@attributes || {}).each do |k, v|
      s << " #{k}"
      # Only string values are written; single quotes are backslash-escaped.
      s << "='#{v.gsub(/'/,"\\\\'")}'" if String === v
    end
    s << " /" if @closing == :self
    s << ">"
    @children.each { |child| s << child.to_s }
    s << "</#{@name}>" if @closing != :self && !@children.empty?
    s
  end

  # If either the node or any of its children meet the given conditions, the
  # matching node is returned. Otherwise, +nil+ is returned. (See the
  # description of the valid conditions in the +match+ method.)
  def find(conditions)
    match(conditions) && self || super
  end

  # Returns +true+, indicating that this node represents an HTML tag.
  def tag?
    true
  end

  # Returns +true+ if the node meets all of the given conditions, and
  # +false+ otherwise. The +conditions+ parameter must be a hash of any of
  # the following keys (all are optional):
  #
  # * <tt>:content</tt>: at least one child must match the value; a node
  #   without children is tested against the empty string.
  # * <tt>:tag</tt>: the node name must match the corresponding value
  # * <tt>:attributes</tt>: a hash. The node's values must match the
  #   corresponding values in the hash.
  # * <tt>:parent</tt>: a hash. The node's parent must match the
  #   corresponding hash.
  # * <tt>:child</tt>: a hash. At least one of the node's immediate children
  #   must meet the criteria described by the hash.
  # * <tt>:ancestor</tt>: a hash. At least one of the node's ancestors must
  #   meet the criteria described by the hash.
  # * <tt>:descendant</tt>: a hash. At least one of the node's descendants
  #   must meet the criteria described by the hash.
  # * <tt>:sibling</tt>: a hash. At least one of the node's siblings must
  #   meet the criteria described by the hash.
  # * <tt>:after</tt>: a hash. At least one sibling preceding the node must
  #   meet the criteria described by the hash.
  # * <tt>:before</tt>: a hash. At least one sibling following the node must
  #   meet the criteria described by the hash.
  # * <tt>:children</tt>: a hash, for counting children of a node. Accepts the
  #   keys:
  # ** <tt>:count</tt>: either a number or a range which must equal (or
  #    include) the number of children that match.
  # ** <tt>:less_than</tt>: the number of matching children must be less than
  #    this number.
  # ** <tt>:greater_than</tt>: the number of matching children must be
  #    greater than this number.
  # ** <tt>:only</tt>: another hash consisting of the keys to use
  #    to match on the children, and only matching children will be
  #    counted.
  #
  # Tag names and attribute values are matched using the following
  # algorithm:
  #
  # * if the condition is a string, it must equal the value.
  # * if the condition is a regexp, it must match the value.
  # * if the condition is a number, the value must equal number.to_s.
  # * if the condition is +true+, the value must not be +nil+.
  # * if the condition is +false+ or +nil+, the value must be +nil+.
  #
  # A node without a parent never satisfies <tt>:parent</tt>,
  # <tt>:sibling</tt>, <tt>:before</tt> or <tt>:after</tt>.
  #
  # Usage:
  #
  #   # test if the node is a "span" tag
  #   node.match :tag => "span"
  #
  #   # test if the node's parent is a "div"
  #   node.match :parent => { :tag => "div" }
  #
  #   # test if any of the node's ancestors are "table" tags
  #   node.match :ancestor => { :tag => "table" }
  #
  #   # test if any of the node's immediate children are "em" tags
  #   node.match :child => { :tag => "em" }
  #
  #   # test if any of the node's descendants are "strong" tags
  #   node.match :descendant => { :tag => "strong" }
  #
  #   # test if the node has between 2 and 4 span tags as immediate children
  #   node.match :children => { :count => 2..4, :only => { :tag => "span" } }
  #
  #   # get funky: test to see if the node is a "div", has a "ul" ancestor
  #   # and an "li" parent (with "class" = "enum"), and whether or not it has
  #   # a "span" descendant that contains text matching /hello world/:
  #   node.match :tag => "div",
  #              :ancestor => { :tag => "ul" },
  #              :parent => { :tag => "li",
  #                           :attributes => { :class => "enum" } },
  #              :descendant => { :tag => "span",
  #                               :child => /hello world/ }
  def match(conditions)
    conditions = validate_conditions(conditions)

    # check content of child nodes
    if conditions[:content]
      if children.empty?
        return false unless match_condition("", conditions[:content])
      else
        return false unless children.any? { |child| child.match(conditions[:content]) }
      end
    end

    # test the name
    if conditions[:tag]
      return false unless match_condition(@name, conditions[:tag])
    end

    # test attributes
    (conditions[:attributes] || {}).each do |key, value|
      return false unless match_condition(self[key], value)
    end

    # test parent (a node without a parent cannot satisfy the condition)
    if conditions[:parent]
      return false unless parent && parent.match(conditions[:parent])
    end

    # test children
    if conditions[:child]
      return false unless children.any? { |child| child.match(conditions[:child]) }
    end

    # test ancestors: walk up until one matches or the root is passed
    if conditions[:ancestor]
      candidate = parent
      candidate = candidate.parent until candidate.nil? || candidate.match(conditions[:ancestor])
      return false unless candidate
    end

    # test descendants
    if conditions[:descendant]
      found = children.any? do |child|
        # test the child
        child.match(conditions[:descendant]) ||
          # test the child's descendants
          child.match(:descendant => conditions[:descendant])
      end
      return false unless found
    end

    # count children
    if opts = conditions[:children]
      # Children that count: nodes with text content, and tags that are
      # self-closing or able to hold children.
      matches = children.select do |c|
        c.match(/./) ||
          (c.kind_of?(HTML::Tag) && (c.closing == :self || !c.childless?))
      end

      matches = matches.select { |c| c.match(opts[:only]) } if opts[:only]
      opts.each do |key, value|
        next if key == :only
        case key
        when :count
          if Integer === value
            return false if matches.length != value
          else
            return false unless value.include?(matches.length)
          end
        when :less_than
          return false unless matches.length < value
        when :greater_than
          return false unless matches.length > value
        else raise "unknown count condition #{key}"
        end
      end
    end

    # test siblings
    if conditions[:sibling] || conditions[:before] || conditions[:after]
      siblings = parent ? parent.children : []
      # Locate this node by identity: #== is structural, so a sibling with
      # the same name, attributes and children would otherwise be mistaken
      # for the node itself.
      self_index = siblings.index { |s| s.equal?(self) }

      if conditions[:sibling]
        others = siblings.reject { |s| s.equal?(self) }
        return false unless others.any? { |s| s.match(conditions[:sibling]) }
      end

      if conditions[:before]
        following = self_index ? siblings[(self_index + 1)..-1] : []
        return false unless following.any? { |s| s.match(conditions[:before]) }
      end

      if conditions[:after]
        preceding = self_index ? siblings[0, self_index] : []
        return false unless preceding.any? { |s| s.match(conditions[:after]) }
      end
    end

    true
  end

  # Tags are equal when they are the same object, or when they are equal as
  # nodes and have the same closing status, name and attributes.
  def ==(node)
    return true if equal?(node)
    return false unless super
    return false unless closing == node.closing && self.name == node.name
    attributes == node.attributes
  end

  private

  # Match the given value to the given condition. A condition of any type
  # other than those listed never matches.
  def match_condition(value, condition)
    case condition
    when String
      value && value == condition
    when Regexp
      value && value.match(condition)
    when Numeric
      value == condition.to_s
    when true
      !value.nil?
    when false, nil
      value.nil?
    else
      false
    end
  end
end
534
+ end