scrapi 1.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,534 @@
1
+ require 'strscan'
2
+
3
+ module HTML #:nodoc:
4
+
5
# The normalized form of the search conditions accepted by the node matching
# methods. Whatever the caller passes in is copied (the caller's hash is
# never modified) and rewritten so that:
#
# * top-level keys are symbols;
# * a non-hash argument becomes <tt>{ :content => argument }</tt>;
# * the <tt>:attributes</tt> value has string keys;
# * the values of <tt>:parent</tt>, <tt>:child</tt>, <tt>:ancestor</tt>,
#   <tt>:descendant</tt>, <tt>:sibling</tt>, <tt>:before</tt> and
#   <tt>:after</tt> are themselves Conditions;
# * the <tt>:children</tt> value has symbol keys limited to
#   <tt>:count</tt>, <tt>:greater_than</tt>, <tt>:less_than</tt> and
#   <tt>:only</tt> (whose value becomes a Conditions).
#
# Any other key raises a RuntimeError.
class Conditions < Hash #:nodoc:
  # Builds the normalized conditions from +hash+, which may be a Hash or any
  # other object (treated as a <tt>:content</tt> condition).
  def initialize(hash)
    super()
    hash = { :content => hash } unless Hash === hash
    keys_to_symbols(hash).each do |key, value|
      self[key] = normalize(key, value)
    end
  end

  private

  # Returns the normalized value to store under the top-level +key+, or
  # raises if +key+ is not a recognized condition.
  def normalize(key, value)
    case key
    when :tag, :content
      # valid as given, no further processing required
      value
    when :attributes
      keys_to_strings(value)
    when :parent, :child, :ancestor, :descendant, :sibling, :before, :after
      Conditions.new(value)
    when :children
      normalize_children(value)
    else
      raise "illegal key #{key.inspect} => #{value.inspect}"
    end
  end

  # Returns a symbol-keyed copy of the <tt>:children</tt> options, wrapping
  # the <tt>:only</tt> value in a Conditions and rejecting unknown keys.
  def normalize_children(options)
    normalized = {}
    keys_to_symbols(options).each do |key, value|
      case key
      when :count, :greater_than, :less_than
        normalized[key] = value
      when :only
        normalized[key] = Conditions.new(value)
      else
        raise "illegal key #{key.inspect} => #{value.inspect}"
      end
    end
    normalized
  end

  # Returns a copy of +hash+ whose keys have been converted with +to_s+.
  def keys_to_strings(hash)
    ensure_hash_like(hash)
    copy = {}
    hash.keys.each { |key| copy[key.to_s] = hash[key] }
    copy
  end

  # Returns a copy of +hash+ whose keys have been converted with +to_sym+.
  # Raises if a key cannot be converted.
  def keys_to_symbols(hash)
    ensure_hash_like(hash)
    copy = {}
    hash.keys.each do |key|
      raise "illegal key #{key.inspect}" unless key.respond_to?(:to_sym)
      copy[key.to_sym] = hash[key]
    end
    copy
  end

  # Fails with a descriptive message (rather than a NoMethodError from deep
  # inside the copy helpers) when a nested option is not hash-like.
  def ensure_hash_like(value)
    return if value.respond_to?(:keys)
    raise "illegal value #{value.inspect} (expected a hash)"
  end
end
55
+
56
# The base class of all nodes, textual and otherwise, in an HTML document.
# A plain Node never matches anything itself; it only holds children and
# forwards searches to them.
class Node #:nodoc:
  # The array of children of this node. Not all nodes have children.
  attr_reader :children

  # The parent node of this node. All nodes have a parent, except for the
  # root node.
  attr_reader :parent

  # The line number of the input where this node was begun
  attr_reader :line

  # The byte position in the input where this node was begun
  attr_reader :position

  # Create a new node as a child of the given parent. The node is not
  # appended to the parent's children here.
  def initialize(parent, line=0, pos=0)
    @parent = parent
    @children = []
    @line, @position = line, pos
  end

  # Return a textual representation of the node: the concatenation of the
  # textual representations of its children.
  def to_s
    @children.inject("") { |text, child| text << child.to_s }
  end

  # Return false (subclasses must override this to provide specific matching
  # behavior.) +conditions+ may be of any type.
  def match(conditions)
    false
  end

  # Search the children of this node for the first node for which #find
  # returns non +nil+. Returns the result of the #find call that succeeded,
  # or +nil+ when no child finds anything.
  def find(conditions)
    conditions = validate_conditions(conditions)

    @children.each do |child|
      found = child.find(conditions)
      return found if found
    end
    nil
  end

  # Search for all nodes that match the given conditions, and return them
  # as an array. The node itself is included when it matches, followed by
  # the matches found under each child in order.
  def find_all(conditions)
    conditions = validate_conditions(conditions)

    matches = []
    matches << self if match(conditions)
    @children.each { |child| matches.concat(child.find_all(conditions)) }
    matches
  end

  # Returns +false+. Subclasses may override this if they define a kind of
  # tag.
  def tag?
    false
  end

  # Returns +conditions+ unchanged when it already is a Conditions,
  # otherwise wraps it in a new Conditions.
  def validate_conditions(conditions)
    Conditions === conditions ? conditions : Conditions.new(conditions)
  end

  # Two nodes are equal when they have the same class and their children
  # are pairwise equal. Comparison stops at the first differing child.
  def ==(node)
    return false unless self.class == node.class && children.size == node.children.size

    children.each_with_index do |child, i|
      return false unless child == node.children[i]
    end
    true
  end

  class << self
    # Builds the node described by +content+, a single token of markup or
    # text, as a child of +parent+:
    #
    # * a Text when the content does not look like markup;
    # * a CDATA for a <tt><![CDATA[ ... ]]></tt> section (its content
    #   excludes the delimiters);
    # * a Tag otherwise, with a lower-cased name, lower-cased attribute
    #   names and a closing marker of +nil+, <tt>:close</tt> or
    #   <tt>:self</tt>. Closing tags get +nil+ attributes; attributes
    #   without a value are stored as +true+.
    #
    # With +strict+ set, malformed markup raises a RuntimeError; otherwise
    # the parser recovers as best it can.
    def parse(parent, line, pos, content, strict=true)
      return Text.new(parent, line, pos, content) if content !~ /^<\S/

      scanner = StringScanner.new(content)

      # +^+ also anchors after a newline, so the check above can pass for
      # content that does not itself begin with "<".
      unless scanner.skip(/</)
        raise "expected <" if strict
        return Text.new(parent, line, pos, content)
      end

      if scanner.skip(/!\[CDATA\[/)
        return CDATA.new(parent, line, pos, scan_cdata(scanner, content, strict))
      end

      closing = ( scanner.scan(/\//) ? :close : nil )
      name = scanner.scan(/[\w:]+/)
      return Text.new(parent, line, pos, content) unless name
      name = name.downcase

      attributes = nil
      unless closing
        attributes = scan_attributes(scanner)
        closing = ( scanner.scan(/\//) ? :self : nil )
      end

      unless scanner.scan(/\s*>/)
        if strict
          raise "expected > (got #{scanner.rest.inspect} for #{content}, #{attributes.inspect})"
        else
          # throw away all text until we find what we're looking for
          scanner.skip_until(/>/) or scanner.terminate
        end
      end

      Tag.new(parent, line, pos, name, attributes, closing)
    end

    private

    # Returns the text between the CDATA opener (already consumed) and the
    # "]]>" terminator. An unterminated section raises in strict mode and
    # otherwise yields the remainder of the content.
    def scan_cdata(scanner, content, strict)
      if section = scanner.scan_until(/\]\]>/)
        # scan_until returns the text up to and including the terminator
        section[0...-3]
      elsif strict
        raise "expected ]]> (got #{scanner.rest.inspect} for #{content})"
      else
        remainder = scanner.rest
        scanner.terminate
        remainder
      end
    end

    # Reads <tt>name</tt>, <tt>name=value</tt> and <tt>name="value"</tt>
    # pairs until something that is not an attribute name is reached, and
    # returns them as a hash keyed by lower-cased name.
    def scan_attributes(scanner)
      attributes = {}
      scanner.skip(/\s*/)
      while attr_name = scanner.scan(/[-\w:]+/)
        value = true
        if scanner.scan(/\s*=\s*/)
          if delim = scanner.scan(/['"]/)
            value = scan_quoted_value(scanner, delim)
          else
            value = scanner.scan(/[^\s>\/]+/)
          end
        end
        attributes[attr_name.downcase] = value
        scanner.skip(/\s*/)
      end
      attributes
    end

    # Reads a quoted attribute value up to the closing +delim+ (or the end
    # of input). A backslash and the character after it are copied
    # verbatim, so an escaped delimiter does not end the value.
    def scan_quoted_value(scanner, delim)
      value = ""
      while text = scanner.scan(/[^#{delim}\\]+|./)
        case text
        when "\\"
          value << text
          # getch is nil when the backslash is the last character of input
          escaped = scanner.getch
          value << escaped if escaped
        when delim
          break
        else
          value << text
        end
      end
      value
    end
  end
end
205
+
206
# A node that represents text, rather than markup.
class Text < Node #:nodoc:

  # The text held by this node.
  attr_reader :content

  # Creates a new text node as a child of the given parent, with the given
  # content.
  def initialize(parent, line, pos, content)
    super(parent, line, pos)
    @content = content
  end

  # Returns the content of this node.
  def to_s
    @content
  end

  # Returns +self+ if this node meets the given conditions, and a false
  # value otherwise. Text nodes support conditions of the following kinds:
  #
  # * if +conditions+ is a string, it must be a substring of the node's
  #   content
  # * if +conditions+ is a regular expression, it must match the node's
  #   content
  # * if +conditions+ is a hash, it must contain a <tt>:content</tt> key that
  #   is either a string or a regexp, and which is interpreted as described
  #   above.
  def find(conditions)
    match(conditions) && self
  end

  # Returns a true value if this node meets the given conditions, or
  # +nil+/+false+ otherwise. See the discussion of #find for the valid
  # conditions.
  def match(conditions)
    case conditions
    when String then @content.index(conditions)
    when Regexp then @content =~ conditions
    when Hash
      conditions = validate_conditions(conditions)

      # Text nodes only have :content, :parent, :ancestor; any other key
      # can never be satisfied by a text node.
      # NOTE(review): :parent and :ancestor are accepted here but only
      # :content is actually evaluated below.
      unsupported = conditions.keys - [:content, :parent, :ancestor]
      return false unless unsupported.empty?

      match(conditions[:content])
    end
  end

  # Text nodes are equal when the base node comparison succeeds and their
  # content is equal.
  def ==(node)
    super && content == node.content
  end
end
264
+
265
# A CDATA node is simply a text node with a specialized way of displaying
# itself.
class CDATA < Text #:nodoc:
  # Returns the node's content wrapped in the CDATA section delimiters.
  # A CDATA section is terminated by "]]>" (two closing brackets).
  def to_s
    "<![CDATA[#{super}]]>"
  end
end
272
+
273
# A Tag is any node that represents markup. It may be an opening tag, a
# closing tag, or a self-closing tag. It has a name, and may have a hash of
# attributes.
class Tag < Node #:nodoc:

  # Either +nil+, <tt>:close</tt>, or <tt>:self</tt>
  attr_reader :closing

  # Either +nil+, or a hash of attributes for this node.
  attr_reader :attributes

  # The name of this tag.
  attr_reader :name

  # Create a new node as a child of the given parent, with the given name,
  # attributes (a hash or +nil+) and closing status (+nil+,
  # <tt>:close</tt> or <tt>:self</tt>).
  def initialize(parent, line, pos, name, attributes, closing)
    super(parent, line, pos)
    @name = name
    @attributes = attributes
    @closing = closing
  end

  # A convenience for obtaining an attribute of the node. Returns +nil+ if
  # the node has no attributes.
  def [](attr)
    @attributes ? @attributes[attr] : nil
  end

  # Returns a true value if this tag can NOT contain child nodes: it is a
  # closing or self-closing tag, or its name is one of the HTML elements
  # listed below. When +xml+ is true the name list is ignored, so an opening
  # tag always reports +false+.
  def childless?(xml = false)
    return false if xml && @closing.nil?
    !@closing.nil? ||
      @name =~ /^(img|br|hr|link|meta|area|base|basefont|
                 col|frame|input|isindex|param)$/ox
  end

  # Returns a textual representation of the node. Attributes without a
  # string value are written as a bare name; single quotes inside values
  # are backslash-escaped. The end tag is only written when the node has
  # children.
  def to_s
    return "</#{@name}>" if @closing == :close

    s = "<#{@name}"
    # @attributes may be nil (see #attributes), which renders as none
    (@attributes || {}).each do |k, v|
      s << " #{k}"
      s << "='#{v.gsub(/'/,"\\\\'")}'" if String === v
    end
    s << " /" if @closing == :self
    s << ">"
    @children.each { |child| s << child.to_s }
    s << "</#{@name}>" if @closing != :self && !@children.empty?
    s
  end

  # If either the node or any of its children meet the given conditions, the
  # matching node is returned. Otherwise, +nil+ is returned. (See the
  # description of the valid conditions in the +match+ method.)
  def find(conditions)
    match(conditions) && self || super
  end

  # Returns +true+, indicating that this node represents an HTML tag.
  def tag?
    true
  end

  # Returns +true+ if the node meets ALL of the given conditions, and
  # +false+ otherwise. The +conditions+ parameter must be a hash of any of
  # the following keys (all are optional); anything that is not a hash is
  # treated as a <tt>:content</tt> condition:
  #
  # * <tt>:content</tt>: at least one immediate child must match the value
  #   (a node without children is tested as if its content were "").
  # * <tt>:tag</tt>: the node name must match the corresponding value
  # * <tt>:attributes</tt>: a hash. The node's values must match the
  #   corresponding values in the hash.
  # * <tt>:parent</tt>: a hash. The node's parent must match the
  #   corresponding hash. A node without a parent never matches.
  # * <tt>:child</tt>: a hash. At least one of the node's immediate children
  #   must meet the criteria described by the hash.
  # * <tt>:ancestor</tt>: a hash. At least one of the node's ancestors must
  #   meet the criteria described by the hash.
  # * <tt>:descendant</tt>: a hash. At least one of the node's descendants
  #   must meet the criteria described by the hash.
  # * <tt>:sibling</tt>: a hash. At least one of the node's siblings must
  #   meet the criteria described by the hash.
  # * <tt>:after</tt>: a hash. At least one sibling that comes before the
  #   node must meet the criteria described by the hash.
  # * <tt>:before</tt>: a hash. At least one sibling that comes after the
  #   node must meet the criteria described by the hash.
  # * <tt>:children</tt>: a hash, for counting children of a node. Accepts the
  #   keys:
  # ** <tt>:count</tt>: either a number or a range which must equal (or
  #    include) the number of children that match.
  # ** <tt>:less_than</tt>: the number of matching children must be less than
  #    this number.
  # ** <tt>:greater_than</tt>: the number of matching children must be
  #    greater than this number.
  # ** <tt>:only</tt>: another hash consisting of the keys to use
  #    to match on the children, and only matching children will be
  #    counted.
  #
  # Tag names and attribute values are matched using the following
  # algorithm:
  #
  # * if the condition is a string, the value must be equal to it.
  # * if the condition is a regexp, it must match the value.
  # * if the condition is a number, the value must match number.to_s.
  # * if the condition is +true+, the value must not be +nil+.
  # * if the condition is +false+ or +nil+, the value must be +nil+.
  #
  # Siblings are identified by object identity, so a sibling that merely
  # looks the same as this node (same name, attributes and children) is
  # still treated as a distinct sibling.
  #
  # Usage:
  #
  #   # test if the node is a "span" tag
  #   node.match :tag => "span"
  #
  #   # test if the node's parent is a "div"
  #   node.match :parent => { :tag => "div" }
  #
  #   # test if any of the node's ancestors are "table" tags
  #   node.match :ancestor => { :tag => "table" }
  #
  #   # test if any of the node's immediate children are "em" tags
  #   node.match :child => { :tag => "em" }
  #
  #   # test if any of the node's descendants are "strong" tags
  #   node.match :descendant => { :tag => "strong" }
  #
  #   # test if the node has between 2 and 4 span tags as immediate children
  #   node.match :children => { :count => 2..4, :only => { :tag => "span" } }
  #
  #   # get funky: test to see if the node is a "div", has a "ul" ancestor
  #   # and an "li" parent (with "class" = "enum"), and whether or not it has
  #   # a "span" descendant that contains text matching /hello world/:
  #   node.match :tag => "div",
  #              :ancestor => { :tag => "ul" },
  #              :parent => { :tag => "li",
  #                           :attributes => { :class => "enum" } },
  #              :descendant => { :tag => "span",
  #                               :child => /hello world/ }
  def match(conditions)
    conditions = validate_conditions(conditions)

    # check content of child nodes
    if conditions[:content]
      if children.empty?
        return false unless match_condition("", conditions[:content])
      else
        return false unless children.any? { |child| child.match(conditions[:content]) }
      end
    end

    # test the name
    return false if conditions[:tag] && !match_condition(@name, conditions[:tag])

    # test attributes
    (conditions[:attributes] || {}).each do |key, value|
      return false unless match_condition(self[key], value)
    end

    # test parent (a node without a parent cannot satisfy this)
    if conditions[:parent]
      return false unless parent && parent.match(conditions[:parent])
    end

    # test children
    if conditions[:child]
      return false unless children.any? { |child| child.match(conditions[:child]) }
    end

    # test ancestors: walk up until one matches or the root is passed
    if conditions[:ancestor]
      ancestor = parent
      ancestor = ancestor.parent while ancestor && !ancestor.match(conditions[:ancestor])
      return false unless ancestor
    end

    # test descendants
    if conditions[:descendant]
      found = children.any? do |child|
        # test the child, then the child's own descendants
        child.match(conditions[:descendant]) ||
          child.match(:descendant => conditions[:descendant])
      end
      return false unless found
    end

    # count children
    if conditions[:children]
      return false unless children_count_matches?(conditions[:children])
    end

    # test siblings
    if conditions[:sibling] || conditions[:before] || conditions[:after]
      return false unless siblings_match?(conditions)
    end

    true
  end

  # Tags are equal when they are the same object, or when the base node
  # comparison succeeds and closing status, name and attributes are equal.
  def ==(node)
    return true if equal?(node)
    return false unless super
    return false unless closing == node.closing && self.name == node.name
    attributes == node.attributes
  end

  private

  # Match the given value to the given condition. +value+ may be a string,
  # +true+ (an attribute without a value) or +nil+; a regexp condition only
  # matches values that can be matched against a pattern.
  def match_condition(value, condition)
    case condition
    when String
      value && value == condition
    when Regexp
      value.respond_to?(:match) && value.match(condition)
    when Numeric
      value == condition.to_s
    when true
      !value.nil?
    when false, nil
      value.nil?
    else
      false
    end
  end

  # Applies the <tt>:children</tt> counting options in +opts+. Children are
  # counted when they match /./ as content, or are tags that are
  # self-closing or not childless; <tt>:only</tt> narrows the counted set
  # further. Raises on an unknown option.
  def children_count_matches?(opts)
    counted = children.select do |c|
      c.match(/./) ||
        (c.kind_of?(HTML::Tag) && (c.closing == :self || !c.childless?))
    end
    counted = counted.select { |c| c.match(opts[:only]) } if opts[:only]

    opts.each do |key, value|
      case key
      when :only
        next
      when :count
        if Integer === value
          return false if counted.length != value
        else
          return false unless value.include?(counted.length)
        end
      when :less_than
        return false unless counted.length < value
      when :greater_than
        return false unless counted.length > value
      else
        raise "unknown count condition #{key}"
      end
    end
    true
  end

  # Applies the <tt>:sibling</tt>, <tt>:before</tt> and <tt>:after</tt>
  # conditions. This node is located among its parent's children by
  # identity (+equal?+), not by +==+, because +==+ compares structure and
  # would confuse this node with an identical-looking sibling.
  def siblings_match?(conditions)
    siblings = parent ? parent.children : []

    self_index = nil
    siblings.each_with_index do |sibling, i|
      if sibling.equal?(self)
        self_index = i
        break
      end
    end

    if conditions[:sibling]
      others = siblings.reject { |s| s.equal?(self) }
      return false unless others.any? { |s| s.match(conditions[:sibling]) }
    end

    if conditions[:before]
      # without a known position there are no siblings after this node
      return false unless self_index
      following = siblings[(self_index + 1)..-1]
      return false unless following.any? { |s| s.match(conditions[:before]) }
    end

    if conditions[:after]
      # without a known position there are no siblings before this node
      return false unless self_index
      preceding = siblings[0, self_index]
      return false unless preceding.any? { |s| s.match(conditions[:after]) }
    end

    true
  end
end
534
+ end