assaf-scrapi 1.2.1

@@ -0,0 +1,105 @@
+ require 'strscan'
+
+ module HTML #:nodoc:
+
+   # A simple HTML tokenizer. It simply breaks a stream of text into tokens, where each
+   # token is a string. Each string represents either "text", or an HTML element.
+   #
+   # This currently assumes valid XHTML, which means no free < or > characters.
+   #
+   # Usage:
+   #
+   #   tokenizer = HTML::Tokenizer.new(text)
+   #   while token = tokenizer.next
+   #     p token
+   #   end
+   class Tokenizer #:nodoc:
+
+     # The current (byte) position in the text
+     attr_reader :position
+
+     # The current line number
+     attr_reader :line
+
+     # Create a new Tokenizer for the given text.
+     def initialize(text)
+       @scanner = StringScanner.new(text)
+       @position = 0
+       @line = 0
+       @current_line = 1
+     end
+
+     # Return the next token in the sequence, or +nil+ if there are no more tokens in
+     # the stream.
+     def next
+       return nil if @scanner.eos?
+       @position = @scanner.pos
+       @line = @current_line
+       if @scanner.check(/<\S/)
+         update_current_line(scan_tag)
+       else
+         update_current_line(scan_text)
+       end
+     end
+
+     private
+
+     # Treat the text at the current position as a tag, and scan it. Supports
+     # comments, doctype tags, and regular tags, and ignores less-than and
+     # greater-than characters within quoted strings.
+     def scan_tag
+       tag = @scanner.getch
+       if @scanner.scan(/!--/) # comment
+         tag << @scanner.matched
+         tag << (@scanner.scan_until(/--\s*>/) || @scanner.scan_until(/\Z/))
+       elsif @scanner.scan(/!\[CDATA\[/)
+         tag << @scanner.matched
+         tag << @scanner.scan_until(/\]\]>/)
+       elsif @scanner.scan(/!/) # doctype
+         tag << @scanner.matched
+         tag << consume_quoted_regions
+       else
+         tag << consume_quoted_regions
+       end
+       tag
+     end
+
+     # Scan all text up to the next < character and return it.
+     def scan_text
+       "#{@scanner.getch}#{@scanner.scan(/[^<]*/)}"
+     end
+
+     # Counts the number of newlines in the text and updates the current line
+     # accordingly.
+     def update_current_line(text)
+       text.scan(/\r?\n/) { @current_line += 1 }
+     end
+
+     # Skips over quoted strings, so that less-than and greater-than characters
+     # within the strings are ignored.
+     def consume_quoted_regions
+       text = ""
+       loop do
+         match = @scanner.scan_until(/['"<>]/) or break
+
+         delim = @scanner.matched
+         if delim == "<"
+           match = match.chop
+           @scanner.pos -= 1
+         end
+
+         text << match
+         break if delim == "<" || delim == ">"
+
+         # consume the quoted region
+         while match = @scanner.scan_until(/[\\#{delim}]/)
+           text << match
+           break if @scanner.matched == delim
+           text << @scanner.getch # skip the escaped character
+         end
+       end
+       text
+     end
+   end
+
+ end
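
A minimal usage sketch for the tokenizer above, following its Usage comment (the sample markup is made up, and the require line assumes the gem's usual entry point loads HTML::Tokenizer):

  require "rubygems"
  require "scrapi"   # assumed entry point; adjust if the tokenizer file is loaded differently

  tokenizer = HTML::Tokenizer.new("<p class='intro'>Hello <b>world</b></p>")
  while token = tokenizer.next
    p token
  end
  # Prints, in order: "<p class='intro'>", "Hello ", "<b>", "world", "</b>", "</p>"
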
@@ -0,0 +1,11 @@
+ module HTML #:nodoc:
+   module Version #:nodoc:
+
+     MAJOR = 0
+     MINOR = 5
+     TINY = 3
+
+     STRING = [ MAJOR, MINOR, TINY ].join(".")
+
+   end
+ end
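
With the constants above, the version string works out to "0.5.3":

  puts HTML::Version::STRING   # => "0.5.3"
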
@@ -0,0 +1,990 @@
+ # ScrAPI toolkit for Ruby
+ #
+ # Copyright (c) 2006 Assaf Arkin, under Creative Commons Attribution and/or MIT License
+ # Developed for http://co.mments.com
+ # Code and documentation: http://labnotes.org
+
+
+ require "rubygems"
+ require File.join(File.dirname(__FILE__), "reader")
+
+
+ module Scraper
+
+   class Base
+
+
+     # Information about the HTML page scraped. A structure with the following
+     # attributes:
+     # * <tt>url</tt> -- The URL of the document being scraped. Passed in
+     #   the constructor but may have changed if the page was redirected.
+     # * <tt>original_url</tt> -- The original URL of the document being
+     #   scraped as passed in the constructor.
+     # * <tt>encoding</tt> -- The encoding of the document.
+     # * <tt>last_modified</tt> -- Value of the Last-Modified header returned
+     #   from the server.
+     # * <tt>etag</tt> -- Value of the Etag header returned from the server.
+     PageInfo = Struct.new(:url, :original_url, :encoding, :last_modified, :etag)
+
+
+     class << self
+
+       # :call-seq:
+       #   process(symbol?, selector, values?, extractor)
+       #   process(symbol?, selector, values?) { |element| ... }
+       #
+       # Defines a processing rule. A processing rule consists of a selector
+       # that matches elements, and an extractor that does something interesting
+       # with their value.
+       #
+       # == Symbol
+       #
+       # Rules are processed in the order in which they are defined. Use #rules
+       # if you need to change the order of processing.
+       #
+       # Rules can be named or anonymous. If the first argument is a symbol,
+       # it is used as the rule name. You can use the rule name to position,
+       # remove or replace it.
+       #
+       # == Selector
+       #
+       # The first argument is a selector. It selects elements from the document
+       # that are potential candidates for extraction. Each selected element is
+       # passed to the extractor.
+       #
+       # The +selector+ argument may be a string, an HTML::Selector object or
+       # any object that responds to the +select+ method. Passing an Array
+       # (responds to +select+) will not do anything useful.
+       #
+       # String selectors support value substitution, replacing question marks
+       # (?) in the selector expression with values from the method arguments.
+       # See HTML::Selector for more information.
+       #
+       # == Extractor
+       #
+       # The last argument or block is the extractor. The extractor does
+       # something interesting with the selected element, typically assigns
+       # it to an instance variable of the scraper.
+       #
+       # Since the extractor is called on the scraper, it can also use the
+       # scraper to maintain state, e.g. this extractor counts how many
+       # +div+ elements appear in the document:
+       #   process("div") { |element| @count += 1 }
+       #
+       # The extractor returns +true+ if the element was processed and
+       # should not be passed to any other extractor (including any child
+       # elements).
+       #
+       # The default implementation of #result returns +self+ only if at
+       # least one extractor returned +true+. However, you can override
+       # #result and use extractors that return +false+.
+       #
+       # A block extractor is called with a single element.
+       #
+       # You can also use the #extractor method to create extractors that
+       # assign elements, attributes and text values to instance variables,
+       # or pass a +Hash+ as the last argument to #process. See #extractor
+       # for more information.
+       #
+       # When using a block, the value of the last statement is the result. Do not
+       # use +return+; use +next+ if you want to return a value before the last
+       # statement. +return+ does not do what you expect it to.
+       #
+       # == Example
+       #
+       #   class ScrapePosts < Scraper::Base
+       #     # Select the title of a post
+       #     selector :select_title, "h2"
+       #
+       #     # Select the body of a post
+       #     selector :select_body, ".body"
+       #
+       #     # All elements with class name post.
+       #     process ".post" do |element|
+       #       title = select_title(element)
+       #       body = select_body(element)
+       #       @posts << Post.new(title, body)
+       #       true
+       #     end
+       #
+       #     attr_reader :posts
+       #   end
+       #
+       #   posts = ScrapePosts.scrape(html).posts
+       #
+       # To process only a single element:
+       #
+       #   class ScrapeTitle < Scraper::Base
+       #     process "html>head>title", :title=>:text
+       #     result :title
+       #   end
+       #
+       #   puts ScrapeTitle.scrape(html)
+       def process(*selector, &block)
+         create_process(false, *selector, &block)
+       end
+
+
+       # Similar to #process, but only extracts from the first
+       # selected element. Faster if you know the document contains
+       # only one applicable element, or are only interested in processing
+       # the first one.
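+       #
+       # For example, mirroring the ScrapeTitle example shown under #result
+       # (the class name is illustrative):
+       #   class ScrapeTitle < Scraper::Base
+       #     process_first "html>head>title", :title=>:text
+       #     result :title
+       #   end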
+       def process_first(*selector, &block)
+         create_process(true, *selector, &block)
+       end
+
+
+       # :call-seq:
+       #   selector(symbol, selector, values?)
+       #   selector(symbol, selector, values?) { |elements| ... }
+       #
+       # Create a selector method. You can call a selector method directly
+       # to select elements.
+       #
+       # For example, define a selector:
+       #   selector(:five_divs, "div") { |elems| elems[0..4] }
+       # And call it to retrieve the first five +div+ elements:
+       #   divs = five_divs(element)
+       #
+       # Call a selector method with an element and it returns an array of
+       # elements that match the selector, beginning with the element argument
+       # itself. It returns an empty array if nothing matches.
+       #
+       # If the selector is defined with a block, all selected elements are
+       # passed to the block and the result of the block is returned.
+       #
+       # For convenience, a <tt>first_</tt> method is also created that
+       # returns (and yields) only the first selected element. For example:
+       #   selector :post, "#post"
+       #   @post = first_post(element)
+       #
+       # If the selector is defined with a block, both methods call that
+       # block with an array of elements.
+       #
+       # The +selector+ argument may be a string, an HTML::Selector object or
+       # any object that responds to the +select+ method. Passing an Array
+       # (responds to +select+) will not do anything useful.
+       #
+       # String selectors support value substitution, replacing question marks
+       # (?) in the selector expression with values from the method arguments.
+       # See HTML::Selector for more information.
+       #
+       # When using a block, the value of the last statement is the result. Do not
+       # use +return+; use +next+ if you want to return a value before the last
+       # statement. +return+ does not do what you expect it to.
+       def selector(symbol, *selector, &block)
+         raise ArgumentError, "Missing selector: the first argument tells us what to select" if selector.empty?
+         if selector[0].is_a?(String)
+           selector = HTML::Selector.new(*selector)
+         else
+           raise ArgumentError, "Selector must respond to select() method" unless selector.respond_to?(:select)
+           selector = selector[0]
+         end
+         if block
+           define_method symbol do |element|
+             selected = selector.select(element)
+             return block.call(selected) unless selected.empty?
+           end
+           define_method "first_#{symbol}" do |element|
+             selected = selector.select_first(element)
+             return block.call([selected]) if selected
+           end
+         else
+           define_method symbol do |element|
+             return selector.select(element)
+           end
+           define_method "first_#{symbol}" do |element|
+             return selector.select_first(element)
+           end
+         end
+       end
+
+
+       # Creates an extractor that will extract values from the selected
+       # element and place them in instance variables of the scraper.
+       # You can pass the result to #process.
+       #
+       # == Example
+       #
+       # This example processes a document looking for an element with the
+       # class name +article+. It extracts the attribute +id+ and stores it
+       # in the instance variable +@id+. It extracts the article node itself
+       # and puts it in the instance variable +@article+.
+       #
+       #   class ArticleScraper < Scraper::Base
+       #     process ".article", extractor(:id=>"@id", :article=>:element)
+       #     attr_reader :id, :article
+       #   end
+       #   result = ArticleScraper.scrape(html)
+       #   puts result.id
+       #   puts result.article
+       #
+       # == Sources
+       #
+       # Extractors operate on the selected element, and can extract the
+       # following values:
+       # * <tt>"elem_name"</tt> -- Extracts the element itself if it
+       #   matches the element name (e.g. "h2" will extract only level 2
+       #   header elements).
+       # * <tt>"@attr_name"</tt> -- Extracts the attribute value from the
+       #   element if specified (e.g. "@id" will extract the id attribute).
+       # * <tt>"elem_name@attr_name"</tt> -- Extracts the attribute value
+       #   from the element if specified, but only if the element has the
+       #   specified name (e.g. "h2@id").
+       # * <tt>:element</tt> -- Extracts the element itself.
+       # * <tt>:text</tt> -- Extracts the text value of the node.
+       # * <tt>Scraper</tt> -- Using this class creates a scraper to
+       #   process the current element and extract the result. This can
+       #   be used for handling complex structure.
+       #
+       # If you use an array of sources, the first source that matches
+       # anything is used. For example, <tt>["abbr@title", :text]</tt>
+       # extracts the value of the +title+ attribute if the element is
+       # +abbr+, otherwise the text value of the element.
+       #
+       # If you use a hash, you can extract multiple values at the same
+       # time. For example, <tt>{:id=>"@id", :class=>"@class"}</tt>
+       # extracts the +id+ and +class+ attribute values.
+       #
+       # :element and :text are special cases of symbols. You can pass any
+       # symbol that matches a class method and that class method will
+       # be called to extract a value from the selected element.
+       # You can also pass a Proc or Method directly.
+       #
+       # And it's always possible to pass a static value, quite useful for
+       # processing an element with more than one rule (<tt>:skip=>false</tt>).
+       #
+       # == Targets
+       #
+       # Extractors assign the extracted value to an instance variable
+       # of the scraper. The instance variable contains the last value
+       # extracted.
+       #
+       # Also creates an accessor for that instance variable. An accessor
+       # is created if no such method exists. For example,
+       # <tt>:title=>:text</tt> creates an accessor for +title+. However,
+       # <tt>:id=>"@id"</tt> does not create an accessor since each
+       # object already has a method called +id+.
+       #
+       # If you want to extract multiple values into the same variable,
+       # use #array to declare that accessor as an array.
+       #
+       # Alternatively, you can append <tt>[]</tt> to the variable name.
+       # For example:
+       #   process "*", "ids[]"=>"@id"
+       #   result :ids
+       #
+       # The special target <tt>:skip</tt> allows you to control whether
+       # other rules can apply to the same element. By default a processing
+       # rule without a block (or a block that returns true) will skip
+       # that element so no other processing rule sees it.
+       #
+       # You can change this with <tt>:skip=>false</tt>.
+       def extractor(map)
+         extracts = []
+         map.each_pair do |target, source|
+           source = extract_value_from(source)
+           target = extract_value_to(target)
+           define_method :__extractor do |element|
+             value = source.call(element)
+             target.call(self, value) if !value.nil?
+           end
+           extracts << instance_method(:__extractor)
+           remove_method :__extractor
+         end
+         lambda do |element|
+           extracts.each do |extract|
+             extract.bind(self).call(element)
+           end
+           true
+         end
+       end
+
+
+       # Scrapes the document and returns the result.
+       #
+       # The first argument provides the input document. It can be one of:
+       # * <tt>URI</tt> -- Retrieve an HTML page from this URL and
+       #   scrape it.
+       # * <tt>String</tt> -- The HTML page as a string.
+       # * <tt>HTML::Node</tt> -- An HTML node, can be a document
+       #   or element.
+       #
+       # You can specify options for the scraper class, or override
+       # these by passing options in the second argument. Some options
+       # only make sense in the constructor.
+       #
+       # The following options are supported for reading HTML pages:
+       # * <tt>:last_modified</tt> -- Last-Modified header used for
+       #   caching.
+       # * <tt>:etag</tt> -- ETag header used for caching.
+       # * <tt>:redirect_limit</tt> -- Limits number of redirects
+       #   to follow.
+       # * <tt>:user_agent</tt> -- Value for User-Agent header.
+       # * <tt>:timeout</tt> -- HTTP open connection/read timeouts
+       #   (in seconds).
+       #
+       # The following options are supported for parsing the HTML:
+       # * <tt>:root_element</tt> -- The root element to scrape, see
+       #   also #root_element.
+       # * <tt>:parser</tt> -- Specifies which parser to use.
+       #   (Typically, you set this for the class).
+       # * <tt>:parser_options</tt> -- Options to pass to the parser.
+       #
+       # The result is returned by calling the #result method.
+       # The default implementation returns +self+ if any extractor
+       # returned true, +nil+ otherwise.
+       #
+       # For example:
+       #   result = MyScraper.scrape(url, :root_element=>"body")
+       #
+       # The method may raise any number of exceptions. HTTPError
+       # indicates it failed to retrieve the HTML page, and HTMLParseError
+       # that it failed to parse the page. Other exceptions come from
+       # extractors and the #result method.
+       def scrape(source, options = nil)
+         scraper = self.new(source, options)
+         return scraper.scrape
+       end
+
+
+       # Returns the text of the element.
+       #
+       # You can use this method from an extractor, e.g.:
+       #   process "title", :title=>:text
+       def text(element)
+         text = ""
+         stack = element.children.reverse
+         while node = stack.pop
+           if node.tag?
+             stack.concat node.children.reverse
+           else
+             text << node.content
+           end
+         end
+         return text
+       end
+
+
+       # Returns the element itself.
+       #
+       # You can use this method from an extractor, e.g.:
+       #   process "h1", :header=>:element
+       def element(element)
+         element
+       end
+
+
+       # Specifies which parser to use. The default is +:tidy+.
+       def parser(name = :tidy)
+         self.options[:parser] = name
+       end
+
+
+       # Options to pass to the parser.
+       #
+       # For example, when using Tidy, you can use these options to
+       # tell Tidy how to clean up the HTML.
+       #
+       # This method sets the option for the class. Classes inherit options
+       # from their parents. You can also pass options to the scraper object
+       # itself using the +:parser_options+ option.
+       def parser_options(options)
+         self.options[:parser_options] = options
+       end
+
+
+       # The root element to scrape.
+       #
+       # The root element for an HTML document is +html+. However, if you want
+       # to scrape only the header or body, you can set the root_element to
+       # +head+ or +body+.
+       #
+       # This method sets the root element for the class. Classes inherit
+       # this option from their parents. You can also pass a root element
+       # to the scraper object itself using the +:root_element+ option.
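+       #
+       # For example (MyScraper is only illustrative):
+       #   class MyScraper < Scraper::Base
+       #     root_element "body"
+       #   end
+       # has the same effect as passing <tt>:root_element=>"body"</tt> to #scrape.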
+       def root_element(name)
+         self.options[:root_element] = name ? name.to_s : nil
+       end
+
+
+       # Returns the options for this class.
+       def options()
+         @options ||= {}
+       end
+
+
+       # Returns an array of rules defined for this class. You can use this
+       # array to change the order of rules.
+       def rules()
+         @rules ||= []
+       end
+
+
+       # Modifies this scraper to return a single value or a structure.
+       # Use in combination with accessors.
+       #
+       # When called with one symbol, scraping returns the result of
+       # calling that method (typically an accessor). When called with
+       # two or more symbols, scraping returns a structure of values,
+       # one for each symbol.
+       #
+       # For example:
+       #   class ScrapeTitle < Scraper::Base
+       #     process_first "html>head>title", :title=>:text
+       #     result :title
+       #   end
+       #
+       #   puts "Title: " + ScrapeTitle.scrape(html)
+       #
+       #   class ScrapeDts < Scraper::Base
+       #     process ".dtstart", :dtstart=>["abbr@title", :text]
+       #     process ".dtend", :dtend=>["abbr@title", :text]
+       #     result :dtstart, :dtend
+       #   end
+       #
+       #   dts = ScrapeDts.scrape(html)
+       #   puts "Starts: #{dts.dtstart}"
+       #   puts "Ends: #{dts.dtend}"
+       def result(*symbols)
+         raise ArgumentError, "Use one symbol to return the value of this accessor, multiple symbols to return a structure" if symbols.empty?
+         symbols = symbols.map {|s| s.to_sym}
+         if symbols.size == 1
+           define_method :result do
+             return self.send(symbols[0])
+           end
+         else
+           struct = Struct.new(*symbols)
+           define_method :result do
+             return struct.new(*symbols.collect {|s| self.send(s) })
+           end
+         end
+       end
+
+
+       # Declares which accessors are arrays. You can declare the
+       # accessor here, or use "symbol[]" as the target.
+       #
+       # For example:
+       #   array :urls
+       #   process "a[href]", :urls=>"@href"
+       # Is equivalent to:
+       #   process "a[href]", "urls[]"=>"@href"
+       def array(*symbols)
+         @arrays ||= []
+         symbols.each do |symbol|
+           symbol = symbol.to_sym
+           @arrays << symbol
+           begin
+             self.instance_method(symbol)
+           rescue NameError
+             attr_accessor symbol
+           end
+         end
+       end
+
+
+       private
+
+
+       # Called by #process and #process_first, see there for
+       # documentation. First argument indicates whether to
+       # process only the first matching element (+true+) or
+       # all matching elements (+false+).
+       def create_process(first, *selector, &block)
+         # First argument may be the rule name.
+         name = selector.shift if selector.first.is_a?(Symbol)
+         # Extractor is either a block, last argument or both.
+         if selector.last.is_a?(Proc)
+           extractor = selector.pop
+         elsif selector.last.is_a?(Hash)
+           extractor = extractor(selector.pop)
+         end
+         if block && extractor
+           # Ugly, but no other way to chain two calls bound to the
+           # scraper instance.
+           define_method :__extractor, extractor
+           extractor1 = instance_method(:__extractor)
+           define_method :__extractor, block
+           extractor2 = instance_method(:__extractor)
+           remove_method :__extractor
+           extractor = lambda do |element|
+             extractor1.bind(self).call(element)
+             extractor2.bind(self).call(element)
+           end
+         elsif block
+           extractor = block
+         end
+         raise ArgumentError, "Missing extractor: the last argument tells us what to extract" unless extractor
+         # And if we think the extractor is the last argument,
+         # it's certainly not the selector.
+         raise ArgumentError, "Missing selector: the first argument tells us what to select" if selector.empty?
+         if selector[0].is_a?(String)
+           selector = HTML::Selector.new(*selector)
+         else
+           raise ArgumentError, "Selector must respond to select() method" unless selector.respond_to?(:select)
+           selector = selector[0]
+         end
+         # Create a method for fast evaluation.
+         define_method :__extractor, extractor
+         method = instance_method(:__extractor)
+         remove_method :__extractor
+         # Decide where to put the rule.
+         pos = rules.length
+         if name
+           if find = rules.find {|rule| rule[2] == name }
+             find[0] = selector
+             find[1] = method
+           else
+             rules << [selector, method, name, first]
+           end
+         else
+           rules << [selector, method, name, first]
+         end
+       end
+
+
+       # Returns a Proc that will extract a value from an element.
+       #
+       # The +source+ argument specifies which value to extract.
+       # See #extractor for more details.
+       #
+       # The Proc is called with an element and returns a value
+       # or +nil+.
+       def extract_value_from(source)
+         case source
+         when Array
+           # For an array, each item is itself a source argument.
+           # We stop at the first value we're able to extract.
+           array = source.collect { |i| extract_value_from(i) }
+           return lambda do |element|
+             result = nil
+             array.each { |proc| break if result = proc.call(element) }
+             result
+           end
+         when Hash
+           # For a hash, each pair is a symbol and source argument.
+           # We extract all the values and set them in the hash.
+           hash = source.inject({}) { |h,p| h[p[0]] = extract_value_from(p[1]) ; h }
+           return lambda do |element|
+             result = {}
+             hash.each_pair do |source, target|
+               if value = target.call(element)
+                 result[source] = value
+               end
+             end
+             result unless result.empty?
+           end
+         when Class
+           # A class is a scraper we run on the extracted element.
+           # It must extend Scraper::Base.
+           klass = source
+           while klass = klass.superclass
+             break if klass == Scraper::Base
+           end
+           raise ArgumentError, "Class must be a scraper that extends Scraper::Base" unless klass
+           return lambda { |element| source.new(element).scrape }
+         when Symbol
+           # A symbol is a method we call. We pass it the element
+           # and it returns the extracted value. It must be a class method.
+           method = method(source) rescue
+             raise(ArgumentError, "No method #{source} in #{self.class}")
+           return lambda { |element| method.call(element) }
+         when Proc, Method
+           # Self evident.
+           raise ArgumentError, "Proc or Method must take one argument (an element)" if source.arity == 0
+           return source
+         when /^[\w\-:]+$/
+           # An element name. Return the element if the name matches.
+           return lambda { |element| element if element.name == source }
+         when /^@[\w\-:]+$/
+           # An attribute name. Return its value if the attribute is specified.
+           attr_name = source[1..-1]
+           return lambda { |element| element.attributes[attr_name] }
+         when /^[\w\-:]+@[\w\-:]+$/
+           # An element with attribute name. Return the attribute value if
+           # the attribute is specified, and the element name matches.
+           tag_name, attr_name = source.match(/^([\w\-:]+)@([\w\-:]+)$/)[1..2]
+           return lambda do |element|
+             element.attributes[attr_name] if element.name == tag_name
+           end
+         else
+           return lambda { |element| source }
+           # Anything else and pianos fall from the sky.
+           raise ArgumentError, "Invalid extractor #{source.to_s}"
+         end
+       end
+
+
+       # Returns a Proc that will set the extracted value in the object.
+       #
+       # The +target+ argument identifies an instance variable. It may
+       # be the name of a variable, or the name of a variable with []
+       # appended to denote an array.
+       #
+       # The Proc is called with two arguments: the object to set the
+       # value in, and the value.
+       def extract_value_to(target)
+         if target.is_a?(Array)
+           setters = target.collect do |target|
+             [target, extract_value_to(target)]
+           end
+           return lambda do |object, value|
+             setters.each do |setter|
+               setter[1].call(object, value.send(setter[0]))
+             end
+           end
+         end
+
+         if target.to_sym == :skip
+           return lambda do |object, value|
+             object.send(:skip, value)
+           end
+         end
+
+         target = target.to_s
+         if target[-2..-1] == "[]" or (@arrays && array = @arrays.include?(target.to_sym))
+           target = target[0...-2] unless array
+           # Create an attribute accessor if not already defined.
+           begin
+             self.instance_method(target)
+           rescue NameError
+             attr_accessor target
+           end
+           reader = "#{target}".to_sym
+           writer = "#{target}=".to_sym
+           return lambda do |object, value|
+             array = object.send(reader)
+             object.send(writer, array = []) unless array
+             array << value
+           end
+         else
+           # Create an attribute accessor if not already defined.
+           begin
+             self.instance_method(target)
+           rescue NameError
+             instance = "@#{target}".to_sym
+             attr_accessor target
+           end
+           writer = "#{target}=".to_sym
+           return lambda { |object, value| object.send(writer, value) }
+         end
+       end
+
+
+       def inherited(child)
+         super
+         # Duplicate options, rules and array declarations to any inherited class.
+         child.options.update self.options
+         child.rules.concat self.rules
+         child.instance_variable_set :@arrays, self.instance_variable_get(:@arrays)
+       end
+
+     end
+
+
+     unless const_defined? :READER_OPTIONS
+       READER_OPTIONS = [:last_modified, :etag, :redirect_limit, :user_agent, :timeout]
+     end
+
+
+     # Set to true when the first extractor returns true.
+     attr_accessor :extracted
+
+
+     # Information about the HTML page scraped. See PageInfo.
+     attr_accessor :page_info
+
+
+     # Returns the options for this object.
+     attr_accessor :options
+
+
+     # Create a new scraper instance.
+     #
+     # The argument +source+ is a URL, string containing HTML, or HTML::Node.
+     # The optional argument +options+ is a hash of options passed to the scraper.
+     # See Base#scrape for more details.
+     #
+     # For example:
+     #   # The page we want to scrape
+     #   url = URI.parse("http://example.com")
+     #   # Skip the header
+     #   scraper = MyScraper.new(url, :root_element=>"body")
+     #   result = scraper.scrape
+     def initialize(source, options = nil)
+       @page_info = PageInfo[]
+       @options = options || {}
+       case source
+       when URI
+         @document = source
+       when String, HTML::Node
+         @document = source
+         # TODO: document and test case these two.
+         @page_info.url = @page_info.original_url = @options[:url]
+         @page_info.encoding = @options[:encoding]
+       else
+         raise ArgumentError, "Can only scrape URI, String or HTML::Node"
+       end
+     end
+
+
+     # Scrapes the document and returns the result.
+     #
+     # If the scraper was created with a URL, retrieve the page and parse it.
+     # If the scraper was created with a string, parse the page.
+     #
+     # The result is returned by calling the #result method. The default
+     # implementation returns +self+ if any extractor returned true,
+     # +nil+ otherwise.
+     #
+     # The method may raise any number of exceptions. HTTPError indicates
+     # it failed to retrieve the HTML page, and HTMLParseError that it failed
+     # to parse the page. Other exceptions come from extractors and the
+     # #result method.
+     #
+     # See also Base#scrape.
+     def scrape()
+       # Call prepare with the document before doing anything else.
+       prepare document
+       # Retrieve the document. This may raise HTTPError or HTMLParseError.
+       case document
+       when Array
+         stack = @document.reverse # see below
+       when HTML::Node
+         # If a root element is specified, start selecting from there.
+         # The stack is empty if we can't find any root element (makes
+         # sense). However, the node we're going to process may be
+         # a tag, or an HTML::Document.root which is the equivalent of
+         # a document fragment.
+         root_element = option(:root_element)
+         root = root_element ? @document.find(:tag=>root_element) : @document
+         stack = root ? (root.tag? ? [root] : root.children.reverse) : []
+       else
+         return
+       end
+       # @skip stores all the elements we want to skip (see #skip).
+       # rules stores all the rules we want to process with this
+       # scraper, based on the class definition.
+       @skip = []
+       @stop = false
+       rules = self.class.rules.clone
+       begin
+         # Process the document one node at a time. We process elements
+         # from the end of the stack, so each time we visit child elements,
+         # we add them to the end of the stack in reverse order.
+         while node = stack.pop
+           break if @stop
+           skip_this = false
+           # Only match nodes that are elements, ignore text nodes.
+           # Also ignore any element that's on the skip list, and if
+           # we find one, remove it from the list (since we never visit
+           # the same element twice). But an element may be added twice
+           # to the skip list.
+           # Note: equal? is faster than == for nodes.
+           next unless node.tag?
+           @skip.delete_if { |s| skip_this = true if s.equal?(node) }
+           next if skip_this
+
+           # Run through all the rules until we process the element or
+           # run out of rules. If skip_this=true then we processed the
+           # element and we can break out of the loop. However, we might
+           # process (and skip) descendants so also watch the skip list.
+           rules.delete_if do |selector, extractor, rule_name, first_only|
+             break if skip_this
+             # The result of calling match (selected) is nil, element
+             # or array of elements. We turn it into an array to
+             # process one element at a time. We process all elements
+             # that are not on the skip list (we haven't visited
+             # them yet).
+             if selected = selector.match(node, first_only)
+               selected = [selected] unless selected.is_a?(Array)
+               selected = [selected.first] if first_only
+               selected.each do |element|
+                 # Do not process elements we already skipped
+                 # (see above). However, this time we may visit
+                 # an element twice, since selected elements may
+                 # be descendants of the current element on the
+                 # stack. In rare cases two elements on the stack
+                 # may pick the same descendants.
+                 next if @skip.find { |s| s.equal?(element) }
+                 # Call the extractor method with this element.
+                 # If it returns true, skip the element and, if it is
+                 # the current element, don't process any more
+                 # rules. Again, pay attention to descendants.
+                 if extractor.bind(self).call(element)
+                   @extracted = true
+                 end
+                 if @skip.delete(true)
+                   if element.equal?(node)
+                     skip_this = true
+                   else
+                     @skip << element
+                   end
+                 end
+               end
+               first_only if !selected.empty?
+             end
+           end
+
+           # If we did not skip the element, we're going to process its
+           # children. Reverse order since we're popping from the stack.
+           if !skip_this && children = node.children
+             stack.concat children.reverse
+           end
+         end
+       ensure
+         @skip = nil
+       end
+       collect
+       return result
+     end
+
+
+     # Returns the document being processed.
+     #
+     # If the scraper was created with a URL, this method will attempt to
+     # retrieve the page and parse it.
+     #
+     # If the scraper was created with a string, this method will attempt
+     # to parse the page.
+     #
+     # Be advised that calling this method may raise an exception
+     # (HTTPError or HTMLParseError).
+     #
+     # The document is parsed only the first time this method is called.
+     def document
+       if @document.is_a?(URI)
+         # Attempt to read page. May raise HTTPError.
+         options = {}
+         READER_OPTIONS.each { |key| options[key] = option(key) }
+         request(@document, options)
+       end
+       if @document.is_a?(String)
+         # Parse the page. May raise HTMLParseError.
+         parsed = Reader.parse_page(@document, @page_info.encoding,
+                                    option(:parser_options), option(:parser))
+         @document = parsed.document
+         @page_info.encoding = parsed.encoding
+       end
+       return @document if @document.is_a?(HTML::Node)
+       raise RuntimeError, "No document to process"
+     end
+
+
+     def request(url, options)
+       if page = Reader.read_page(@document, options)
+         @page_info.url = page.url
+         @page_info.original_url = @document
+         @page_info.last_modified = page.last_modified
+         @page_info.etag = page.etag
+         @page_info.encoding = page.encoding
+         @document = page.content
+       end
+     end
+
+
+     # :call-seq:
+     #   skip() => true
+     #   skip(element) => true
+     #   skip([element ...]) => true
+     #
+     # Skips processing the specified element(s).
+     #
+     # If called with a single element, that element will not be processed.
+     #
+     # If called with an array of elements, all the elements in the array
+     # are skipped.
+     #
+     # If called with no element, skips processing the current element.
+     # This has the same effect as returning true.
+     #
+     # For convenience this method always returns true. For example:
+     #   process "h1" do |element|
+     #     @header = element
+     #     skip
+     #   end
+     def skip(elements = nil)
+       case elements
+       when Array then @skip.concat elements
+       when HTML::Node then @skip << elements
+       when nil then @skip << true
+       when true, false then @skip << elements
+       end
+       # Calling skip(element) as the last statement is
+       # redundant by design.
+       return true
+     end
+
+
+     # Stops processing this page. You can call this early on if you
+     # discover there is no interesting information on the page, or
+     # when you are done extracting all useful information.
+     def stop()
+       @stop = true
+     end
+
+
+     # Called by #scrape after creating the document, but before running
+     # any processing rules.
+     #
+     # You can override this method to do any preparation work.
+     def prepare(document)
+     end
+
+
+     # Called by #scrape after scraping the document and before calling #result.
+     # Typically used to run any validation, post-processing steps,
+     # resolving referenced elements, etc.
+     def collect()
+     end
+
+
+     # Returns the result of a successful scrape.
+     #
+     # This method is called by #scrape after running all the rules on the
+     # document. You can also call it directly.
+     #
+     # Override this method to return a specific object, perform post-scraping
+     # processing, validation, etc.
+     #
+     # The default implementation returns +self+ if any extractor returned
+     # true, +nil+ otherwise.
+     #
+     # If you override this method, implement your own logic to determine
+     # if anything was extracted and return +nil+ otherwise. Also, make sure
+     # calling this method multiple times returns the same result.
+     def result()
+       return self if @extracted
+     end
+
+
+     # Returns the value of an option.
+     #
+     # Returns the value of an option passed to the scraper on creation.
+     # If not specified, return the value of the option set for this
+     # scraper class. Options are inherited from the parent class.
+     def option(symbol)
+       return options.has_key?(symbol) ? options[symbol] : self.class.options[symbol]
+     end
+
+
+   end
+
+
+   # Defines an anonymous scraper and returns the class.
+   #
+   # For example:
+   #   links = Scraper.define do
+   #     process "a[href]", :urls=>"@href"
+   #     result :urls
+   #   end
+   #
+   #   puts links.scrape(html)
+   def self.define(&block)
+     kls = Class.new(Scraper::Base)
+     kls.module_eval(&block)
+     return kls
+   end
+
+ end
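
Putting the pieces together, a minimal end-to-end sketch based on the ScrapeTitle and Scraper.define examples in the RDoc above (the markup is illustrative, the require line assumes the gem's usual entry point, and parsing assumes the default Tidy parser is installed):

  require "rubygems"
  require "scrapi"   # assumed entry point for the gem

  # Anonymous scraper that mirrors the documented ScrapeTitle example.
  scrape_title = Scraper.define do
    process_first "html>head>title", :title=>:text
    result :title
  end

  html = "<html><head><title>Hello</title></head><body><p>Sample</p></body></html>"
  puts scrape_title.scrape(html)   # => "Hello"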