scrapi 1.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,105 @@
1
+ require 'strscan'
2
+
3
+ module HTML #:nodoc:
4
+
5
+ # A simple HTML tokenizer. It simply breaks a stream of text into tokens, where each
6
+ # token is a string. Each string represents either "text", or an HTML element.
7
+ #
8
+ # This currently assumes valid XHTML, which means no free < or > characters.
9
+ #
10
+ # Usage:
11
+ #
12
+ # tokenizer = HTML::Tokenizer.new(text)
13
+ # while token = tokenizer.next
14
+ # p token
15
+ # end
16
+ class Tokenizer #:nodoc:
17
+
18
+ # The current (byte) position in the text
19
+ attr_reader :position
20
+
21
+ # The current line number
22
+ attr_reader :line
23
+
24
+ # Create a new Tokenizer for the given text.
25
+ def initialize(text)
26
+ @scanner = StringScanner.new(text)
27
+ @position = 0
28
+ @line = 0
29
+ @current_line = 1
30
+ end
31
+
32
+ # Return the next token in the sequence, or +nil+ if there are no more tokens in
33
+ # the stream.
34
+ def next
35
+ return nil if @scanner.eos?
36
+ @position = @scanner.pos
37
+ @line = @current_line
38
+ if @scanner.check(/<\S/)
39
+ update_current_line(scan_tag)
40
+ else
41
+ update_current_line(scan_text)
42
+ end
43
+ end
44
+
45
+ private
46
+
47
+ # Treat the text at the current position as a tag, and scan it. Supports
48
+ # comments, doctype tags, and regular tags, and ignores less-than and
49
+ # greater-than characters within quoted strings.
50
+ def scan_tag
51
+ tag = @scanner.getch
52
+ if @scanner.scan(/!--/) # comment
53
+ tag << @scanner.matched
54
+ tag << (@scanner.scan_until(/--\s*>/) || @scanner.scan_until(/\Z/))
55
+ elsif @scanner.scan(/!\[CDATA\[/)
56
+ tag << @scanner.matched
57
+ tag << @scanner.scan_until(/\]\]>/)
58
+ elsif @scanner.scan(/!/) # doctype
59
+ tag << @scanner.matched
60
+ tag << consume_quoted_regions
61
+ else
62
+ tag << consume_quoted_regions
63
+ end
64
+ tag
65
+ end
66
+
67
+ # Scan all text up to the next < character and return it.
68
+ def scan_text
69
+ "#{@scanner.getch}#{@scanner.scan(/[^<]*/)}"
70
+ end
71
+
72
+ # Counts the number of newlines in the text and updates the current line
73
+ # accordingly.
74
+ def update_current_line(text)
75
+ text.scan(/\r?\n/) { @current_line += 1 }
76
+ end
77
+
78
+ # Skips over quoted strings, so that less-than and greater-than characters
79
+ # within the strings are ignored.
80
+ def consume_quoted_regions
81
+ text = ""
82
+ loop do
83
+ match = @scanner.scan_until(/['"<>]/) or break
84
+
85
+ delim = @scanner.matched
86
+ if delim == "<"
87
+ match = match.chop
88
+ @scanner.pos -= 1
89
+ end
90
+
91
+ text << match
92
+ break if delim == "<" || delim == ">"
93
+
94
+ # consume the quoted region
95
+ while match = @scanner.scan_until(/[\\#{delim}]/)
96
+ text << match
97
+ break if @scanner.matched == delim
98
+ text << @scanner.getch # skip the escaped character
99
+ end
100
+ end
101
+ text
102
+ end
103
+ end
104
+
105
+ end
@@ -0,0 +1,11 @@
1
+ module HTML #:nodoc:
2
+ module Version #:nodoc:
3
+
4
+ MAJOR = 0
5
+ MINOR = 5
6
+ TINY = 3
7
+
8
+ STRING = [ MAJOR, MINOR, TINY ].join(".")
9
+
10
+ end
11
+ end
@@ -0,0 +1,970 @@
1
+ # ScrAPI toolkit for Ruby
2
+ #
3
+ # Copyright (c) 2006 Assaf Arkin, under Creative Commons Attribution and/or MIT License
4
+ # Developed for http://co.mments.com
5
+ # Code and documentation: http://labnotes.org
6
+
7
+
8
+ require "rubygems"
9
+ require File.join(File.dirname(__FILE__), "reader")
10
+
11
+
12
+ module Scraper
13
+
14
+ class Base
15
+
16
+
17
+ # Information about the HTML page scraped. A structure with the following
18
+ # attributes:
19
+ # * <tt>url</tt> -- The URL of the document being scraped. Passed in
20
+ # the constructor but may have changed if the page was redirected.
21
+ # * <tt>original_url</tt> -- The original URL of the document being
22
+ # scraped as passed in the constructor.
23
+ # * <tt>encoding</tt> -- The encoding of the document.
24
+ # * <tt>last_modified</tt> -- Value of the Last-Modified header returned
25
+ # from the server.
26
+ # * <tt>etag</tt> -- Value of the Etag header returned from the server.
27
+ PageInfo = Struct.new(:url, :original_url, :encoding, :last_modified, :etag)
28
+
29
+
30
+ class << self
31
+
32
+ # :call-seq:
33
+ # process(symbol?, selector, values?, extractor)
34
+ # process(symbol?, selector, values?) { |element| ... }
35
+ #
36
+ # Defines a processing rule. A processing rule consists of a selector
37
+ # that matches element, and an extractor that does something interesting
38
+ # with their value.
39
+ #
40
+ # == Symbol
41
+ #
42
+ # Rules are processed in the order in which they are defined. Use #rules
43
+ # if you need to change the order of processing.
44
+ #
45
+ # Rules can be named or anonymous. If the first argument is a symbol,
46
+ # it is used as the rule name. You can use the rule name to position,
47
+ # remove or replace it.
48
+ #
49
+ # == Selector
50
+ #
51
+ # The first argument is a selector. It selects elements from the document
52
+ # that are potential candidates for extraction. Each selected element is
53
+ # passed to the extractor.
54
+ #
55
+ # The +selector+ argument may be a string, an HTML::Selector object or
56
+ # any object that responds to the +select+ method. Passing an Array
57
+ # (responds to +select+) will not do anything useful.
58
+ #
59
+ # String selectors support value substitution, replacing question marks
60
+ # (?) in the selector expression with values from the method arguments.
61
+ # See HTML::Selector for more information.
62
+ #
63
+ # == Extractor
64
+ #
65
+ # The last argument or block is the extractor. The extractor does
66
+ # something interesting with the selected element, typically assigns
67
+ # it to an instance variable of the scraper.
68
+ #
69
+ # Since the extractor is called on the scraper, it can also use the
70
+ # scraper to maintain state, e.g. this extractor counts how many
71
+ # +div+ elements appear in the document:
72
+ # process "div" { |element| @count += 1 }
73
+ #
74
+ # The extractor returns +true+ if the element was processed and
75
+ # should not be passed to any other extractor (including any child
76
+ # elements).
77
+ #
78
+ # The default implementation of #result returns +self+ only if at
79
+ # least one extractor returned +true+. However, you can override
80
+ # #result and use extractors that return +false+.
81
+ #
82
+ # A block extractor is called with a single element.
83
+ #
84
+ # You can also use the #extractor method to create extractors that
85
+ # assign elements, attributes and text values to instance variables,
86
+ # or pass a +Hash+ as the last argument to #process. See #extractor
87
+ # for more information.
88
+ #
89
+ # When using a block, the last statement is the response. Do not use
90
+ # +return+, use +next+ if you want to return a value before the last
91
+ # statement. +return+ does not do what you expect it to.
92
+ #
93
+ # == Example
94
+ #
95
+ # class ScrapePosts < Scraper::Base
96
+ # # Select the title of a post
97
+ # selector :select_title, "h2"
98
+ #
99
+ # # Select the body of a post
100
+ # selector :select_body, ".body"
101
+ #
102
+ # # All elements with class name post.
103
+ # process ".post" do |element|
104
+ # title = select_title(element)
105
+ # body = select_body(element)
106
+ # @posts << Post.new(title, body)
107
+ # true
108
+ # end
109
+ #
110
+ # attr_reader :posts
111
+ # end
112
+ #
113
+ # posts = ScrapePosts.scrape(html).posts
114
+ #
115
+ # To process only a single element:
116
+ #
117
+ # class ScrapeTitle < Scraper::Base
118
+ # process "html>head>title", :title=>text
119
+ # result :title
120
+ # end
121
+ #
122
+ # puts ScrapeTitle.scrape(html)
123
+ def process(*selector, &block)
124
+ create_process(false, *selector, &block)
125
+ end
126
+
127
+
128
+ # Similar to #process, but only extracts from the first
129
+ # selected element. Faster if you know the document contains
130
+ # only one applicable element, or only interested in processing
131
+ # the first one.
132
+ def process_first(*selector, &block)
133
+ create_process(true, *selector, &block)
134
+ end
135
+
136
+
137
+ # :call-seq:
138
+ # selector(symbol, selector, values?)
139
+ # selector(symbol, selector, values?) { |elements| ... }
140
+ #
141
+ # Create a selector method. You can call a selector method directly
142
+ # to select elements.
143
+ #
144
+ # For example, define a selector:
145
+ # selector :five_divs, "div" { |elems| elems[0..4] }
146
+ # And call it to retrieve the first five +div+ elements:
147
+ # divs = five_divs(element)
148
+ #
149
+ # Call a selector method with an element and it returns an array of
150
+ # elements that match the selector, beginning with the element argument
151
+ # itself. It returns an empty array if nothing matches.
152
+ #
153
+ # If the selector is defined with a block, all selected elements are
154
+ # passed to the block and the result of the block is returned.
155
+ #
156
+ # For convenience, a <tt>first_</tt> method is also created that
157
+ # returns (and yields) only the first selected element. For example:
158
+ # selector :post, "#post"
159
+ # @post = first_post
160
+ #
161
+ # Since the selector is defined with a block, both methods call that
162
+ # block with an array of elements.
163
+ #
164
+ # The +selector+ argument may be a string, an HTML::Selector object or
165
+ # any object that responds to the +select+ method. Passing an Array
166
+ # (responds to +select+) will not do anything useful.
167
+ #
168
+ # String selectors support value substitution, replacing question marks
169
+ # (?) in the selector expression with values from the method arguments.
170
+ # See HTML::Selector for more information.
171
+ #
172
+ # When using a block, the last statement is the response. Do not use
173
+ # +return+, use +next+ if you want to return a value before the last
174
+ # statement. +return+ does not do what you expect it to.
175
+ def selector(symbol, *selector, &block)
176
+ raise ArgumentError, "Missing selector: the first argument tells us what to select" if selector.empty?
177
+ if selector[0].is_a?(String)
178
+ selector = HTML::Selector.new(*selector)
179
+ else
180
+ raise ArgumentError, "Selector must respond to select() method" unless selector.respond_to?(:select)
181
+ selector = selector[0]
182
+ end
183
+ if block
184
+ define_method symbol do |element|
185
+ selected = selector.select(element)
186
+ return block.call(selected) unless selected.empty?
187
+ end
188
+ define_method "first_#{symbol}" do |element|
189
+ selected = selector.select_first(element)
190
+ return block.call([selected]) if selected
191
+ end
192
+ else
193
+ define_method symbol do |element|
194
+ return selector.select(element)
195
+ end
196
+ define_method "first_#{symbol}" do |element|
197
+ return selector.select_first(element)
198
+ end
199
+ end
200
+ end
201
+
202
+
203
+ # Creates an extractor that will extract values from the selected
204
+ # element and place them in instance variables of the scraper.
205
+ # You can pass the result to #process.
206
+ #
207
+ # == Example
208
+ #
209
+ # This example processes a document looking for an element with the
210
+ # class name +article+. It extracts the attribute +id+ and stores it
211
+ # in the instance variable +@id+. It extracts the article node itself
212
+ # and puts it in the instance variable +@article+.
213
+ #
214
+ # class ArticleScraper < Scraper::Base
215
+ # process ".article", extractor(:id=>"@id", :article=>:element)
216
+ # attr_reader :id, :article
217
+ # end
218
+ # result = ArticleScraper.scrape(html)
219
+ # puts result.id
220
+ # puts result.article
221
+ #
222
+ # == Sources
223
+ #
224
+ # Extractors operate on the selected element, and can extract the
225
+ # following values:
226
+ # * <tt>"elem_name"</tt> -- Extracts the element itself if it
227
+ # matches the element name (e.g. "h2" will extract only level 2
228
+ # header elements).
229
+ # * <tt>"attr_name"</tt> -- Extracts the attribute value from the
230
+ # element if specified (e.g. "@id" will extract the id attribute).
231
+ # * <tt>"elem_name@attr_name"</tt> -- Extracts the attribute value
232
+ # from the element if specified, but only if the element has the
233
+ # specified name (e.g. "h2@id").
234
+ # * <tt>:element</tt> -- Extracts the element itself.
235
+ # * <tt>:text</tt> -- Extracts the text value of the node.
236
+ # * <tt>Scraper</tt> -- Using this class creates a scraper to
237
+ # process the current element and extract the result. This can
238
+ # be used for handling complex structure.
239
+ #
240
+ # If you use an array of sources, the first source that matches
241
+ # anything is used. For example, <tt>["attr@title", :text]</tt>
242
+ # extracts the value of the +title+ attribute if the element is
243
+ # +abbr+, otherwise the text value of the element.
244
+ #
245
+ # If you use a hash, you can extract multiple values at the same
246
+ # time. For example, <tt>{:id=>"@id", :class=>"@class"}</tt>
247
+ # extracts the +id+ and +class+ attribute values.
248
+ #
249
+ # :element and :text are special cases of symbols. You can pass any
250
+ # symbol that matches a class method and that class method will
251
+ # be called to extract a value from the selected element.
252
+ # You can also pass a Proc or Method directly.
253
+ #
254
+ # And it's always possible to pass a static value, quite useful for
255
+ # processing an element with more than one rule (<tt>:skip=>false</tt>).
256
+ #
257
+ # == Targets
258
+ #
259
+ # Extractors assign the extracted value to an instance variable
260
+ # of the scraper. The instance variable contains the last value
261
+ # extracted.
262
+ #
263
+ # Also creates an accessor for that instance variable. An accessor
264
+ # is created if no such method exists. For example,
265
+ # <tt>:title=>:text</tt> creates an accessor for +title+. However,
266
+ # <tt>:id=>"@id"</tt> does not create an accessor since each
267
+ # object already has a method called +id+.
268
+ #
269
+ # If you want to extract multiple values into the same variables,
270
+ # use #array to declare that accessor as an array.
271
+ #
272
+ # Alternatively, you can append <tt>[]</tt> to the variable name.
273
+ # For example:
274
+ # process "*", "ids[]"=>"@id"
275
+ # result :ids
276
+ #
277
+ # The special target <tt>:skip</tt> allows you to control whether
278
+ # other rules can apply to the same element. By default a processing
279
+ # rule without a block (or a block that returns true) will skip
280
+ # that element so no other processing rule sees it.
281
+ #
282
+ # You can change this with <tt>:skip=>false</tt>.
283
+ def extractor(map)
284
+ extracts = []
285
+ map.each_pair do |target, source|
286
+ source = extract_value_from(source)
287
+ target = extract_value_to(target)
288
+ define_method :__extractor do |element|
289
+ value = source.call(element)
290
+ target.call(self, value) if !value.nil?
291
+ end
292
+ extracts << instance_method(:__extractor)
293
+ remove_method :__extractor
294
+ end
295
+ lambda do |element|
296
+ extracts.each do |extract|
297
+ extract.bind(self).call(element)
298
+ end
299
+ true
300
+ end
301
+ end
302
+
303
+
304
+ # Scrapes the document and returns the result.
305
+ #
306
+ # The first argument provides the input document. It can be one of:
307
+ # * <tt>URI</tt> -- Retrieve an HTML page from this URL and
308
+ # scrape it.
309
+ # * <tt>String</tt> -- The HTML page as a string.
310
+ # * <tt>HTML::Node</tt> -- An HTML node, can be a document
311
+ # or element.
312
+ #
313
+ # You can specify options for the scraper class, or override
314
+ # these by passing options in the second argument. Some options
315
+ # only make sense in the constructor.
316
+ #
317
+ # The following options are supported for reading HTML pages:
318
+ # * <tt>:last_modified</tt> -- Last-Modified header used for
319
+ # caching.
320
+ # * <tt>:etag</tt> -- ETag header used for caching.
321
+ # * <tt>:redirect_limit</tt> -- Limits number of redirects
322
+ # to follow.
323
+ # * <tt>:user_agent</tt> -- Value for User-Agent header.
324
+ # * <tt>:timeout</tt> -- HTTP open connection/read timeouts
325
+ # (in seconds).
326
+ #
327
+ # The following options are supported for parsing the HTML:
328
+ # * <tt>:root_element</tt> -- The root element to scrape, see
329
+ # also #root_elements.
330
+ # * <tt>:parser</tt> -- Specifies which parser to use.
331
+ # (Typically, you set this for the class).
332
+ # * <tt>:parser_options</tt> -- Options to pass to the parser.
333
+ #
334
+ # The result is returned by calling the #result method.
335
+ # The default implementation returns +self+ if any extractor
336
+ # returned true, +nil+ otherwise.
337
+ #
338
+ # For example:
339
+ # result = MyScraper.scrape(url, :root_element=>"body")
340
+ #
341
+ # The method may raise any number of exceptions. HTTPError
342
+ # indicates it failed to retrieve the HTML page, and HTMLParseError
343
+ # that it failed to parse the page. Other exceptions come from
344
+ # extractors and the #result method.
345
+ def scrape(source, options = nil)
346
+ scraper = self.new(source, options);
347
+ return scraper.scrape
348
+ end
349
+
350
+
351
+ # Returns the text of the element.
352
+ #
353
+ # You can use this method from an extractor, e.g.:
354
+ # process "title", :title=>:text
355
+ def text(element)
356
+ text = ""
357
+ stack = element.children.reverse
358
+ while node = stack.pop
359
+ if node.tag?
360
+ stack.concat node.children.reverse
361
+ else
362
+ text << node.content
363
+ end
364
+ end
365
+ return text
366
+ end
367
+
368
+
369
+ # Returns the element itself.
370
+ #
371
+ # You can use this method from an extractor, e.g.:
372
+ # process "h1", :header=>:element
373
+ def element(element)
374
+ element
375
+ end
376
+
377
+
378
+ # Specifies which parser to use. The default is +:tidy+.
379
+ def parser(name = :tidy)
380
+ self.options[:parser] = name
381
+ end
382
+
383
+
384
+ # Options to pass to the parser.
385
+ #
386
+ # For example, when using Tidy, you can use these options to
387
+ # tell Tidy how to clean up the HTML.
388
+ #
389
+ # This method sets the option for the class. Classes inherit options
390
+ # from their parents. You can also pass options to the scraper object
391
+ # itself using the +:parser_options+ option.
392
+ def parser_options(options)
393
+ self.options[:parser_options] = options
394
+ end
395
+
396
+
397
+ # The root element to scrape.
398
+ #
399
+ # The root element for an HTML document is +html+. However, if you want
400
+ # to scrape only the header or body, you can set the root_element to
401
+ # +head+ or +body+.
402
+ #
403
+ # This method sets the root element for the class. Classes inherit
404
+ # this option from their parents. You can also pass a root element
405
+ # to the scraper object itself using the +:root_element+ option.
406
+ def root_element(name)
407
+ self.options[:root_element] = name ? name.to_s : nil
408
+ end
409
+
410
+
411
+ # Returns the options for this class.
412
+ def options()
413
+ @options ||= {}
414
+ end
415
+
416
+
417
+ # Returns an array of rules defined for this class. You can use this
418
+ # array to change the order of rules.
419
+ def rules()
420
+ @rules ||= []
421
+ end
422
+
423
+
424
+ # Modifies this scraper to return a single value or a structure.
425
+ # Use in combination with accessors.
426
+ #
427
+ # When called with one symbol, scraping returns the result of
428
+ # calling that method (typically an accessor). When called with
429
+ # two or more symbols, scraping returns a structure of values,
430
+ # one for each symbol.
431
+ #
432
+ # For example:
433
+ # class ScrapeTitle < Scraper::Base
434
+ # process_first "html>head>title", :title=>:text
435
+ # result :title
436
+ # end
437
+ #
438
+ # puts "Title: " + ScrapeTitle.scrape(html)
439
+ #
440
+ # class ScrapeDts < Scraper::Base
441
+ # process ".dtstart", :dtstart=>["abbr@title", :text]
442
+ # process ".dtend", :dtend=>["abbr@title", :text]
443
+ # result :dtstart, :dtend
444
+ # end
445
+ #
446
+ # dts = ScrapeDts.scrape(html)
447
+ # puts "Starts: #{dts.dtstart}"
448
+ # puts "Ends: #{dts.dtend}"
449
+ def result(*symbols)
450
+ raise ArgumentError, "Use one symbol to return the value of this accessor, multiple symbols to returns a structure" if symbols.empty?
451
+ symbols = symbols.map {|s| s.to_sym}
452
+ if symbols.size == 1
453
+ define_method :result do
454
+ return self.send(symbols[0])
455
+ end
456
+ else
457
+ struct = Struct.new(*symbols)
458
+ define_method :result do
459
+ return struct.new(*symbols.collect {|s| self.send(s) })
460
+ end
461
+ end
462
+ end
463
+
464
+
465
+ # Declares which accessors are arrays. You can declare the
466
+ # accessor here, or use "symbol[]" as the target.
467
+ #
468
+ # For example:
469
+ # array :urls
470
+ # process "a[href]", :urls=>"@href"
471
+ # Is equivalent to:
472
+ # process "a[href]", "urls[]"=>"@href"
473
+ def array(*symbols)
474
+ @arrays ||= []
475
+ symbols.each { |sym| @arrays << sym.to_sym }
476
+ end
477
+
478
+
479
+ private
480
+
481
+
482
+ # Called by #process and #process_first, see there for
483
+ # documentation. First argument indicates whether to
484
+ # process only the first matching element (+true+) or
485
+ # all matching elements (+false+).
486
+ def create_process(first, *selector, &block)
487
+ # First argument may be the rule name.
488
+ name = selector.shift if selector.first.is_a?(Symbol)
489
+ # Extractor is either a block, last argument or both.
490
+ if selector.last.is_a?(Proc)
491
+ extractor = selector.pop
492
+ elsif selector.last.is_a?(Hash)
493
+ extractor = extractor(selector.pop)
494
+ end
495
+ if block && extractor
496
+ # Ugly, but no other way to chain two calls bound to the
497
+ # scraper instance.
498
+ define_method :__extractor, extractor
499
+ extractor1 = instance_method(:__extractor)
500
+ define_method :__extractor, block
501
+ extractor2 = instance_method(:__extractor)
502
+ remove_method :__extractor
503
+ extractor = lambda do |element|
504
+ extractor1.bind(self).call(element)
505
+ extractor2.bind(self).call(element)
506
+ end
507
+ elsif block
508
+ extractor = block
509
+ end
510
+ raise ArgumentError, "Missing extractor: the last argument tells us what to extract" unless extractor
511
+ # And if we think the extractor is the last argument,
512
+ # it's certainly not the selector.
513
+ raise ArgumentError, "Missing selector: the first argument tells us what to select" if selector.empty?
514
+ if selector[0].is_a?(String)
515
+ selector = HTML::Selector.new(*selector)
516
+ else
517
+ raise ArgumentError, "Selector must respond to select() method" unless selector.respond_to?(:select)
518
+ selector = selector[0]
519
+ end
520
+ # Create a method for fast evaluation.
521
+ define_method :__extractor, extractor
522
+ method = instance_method(:__extractor)
523
+ remove_method :__extractor
524
+ # Decide where to put the rule.
525
+ pos = rules.length
526
+ if name
527
+ if find = rules.find {|rule| rule[2] == name }
528
+ find[0] = selector
529
+ find[1] = method
530
+ else
531
+ rules << [selector, method, name, first]
532
+ end
533
+ else
534
+ rules << [selector, method, name, first]
535
+ end
536
+ end
537
+
538
+
539
+ # Returns a Proc that will extract a value from an element.
540
+ #
541
+ # The +source+ argument specifies which value to extract.
542
+ # See #extractor for more details.
543
+ #
544
+ # The Proc is called with an element and returns a value
545
+ # or +nil+.
546
+ def extract_value_from(source)
547
+ case source
548
+ when Array
549
+ # For an array, each item is itself a source argument.
550
+ # We stop at the first value we're able to extract.
551
+ array = source.collect { |i| extract_value_from(i) }
552
+ return lambda do |element|
553
+ result = nil
554
+ array.each { |proc| break if result = proc.call(element) }
555
+ result
556
+ end
557
+ when Hash
558
+ # For a hash, each pair is a symbol and source argument.
559
+ # We extract all the values and set them in the hash.
560
+ hash = source.inject({}) { |h,p| h[p[0]] = extract_value_from(p[1]) ; h }
561
+ return lambda do |element|
562
+ result = {}
563
+ hash.each_pair do |source, target|
564
+ if value = target.call(element)
565
+ result[source] = value
566
+ end
567
+ end
568
+ result unless result.empty?
569
+ end
570
+ when Class
571
+ # A class is a scraper we run on the extracted element.
572
+ # It must extend Scraper::Base.
573
+ klass = source
574
+ while klass = klass.superclass
575
+ break if klass == Scraper::Base
576
+ end
577
+ raise ArgumentError, "Class must be a scraper that extends Scraper::Base" unless klass
578
+ return lambda { |element| source.new(element).scrape }
579
+ when Symbol
580
+ # A symbol is a method we call. We pass it the element
581
+ # and it returns the extracted value. It must be a class method.
582
+ method = method(source) rescue
583
+ raise(ArgumentError, "No method #{source} in #{self.class}")
584
+ return lambda { |element| method.call(element) }
585
+ when Proc, Method
586
+ # Self evident.
587
+ raise ArgumentError, "Proc or Method must take one argument (an element)" if source.arity == 0
588
+ return source
589
+ when /^[\w\-:]+$/
590
+ # An element name. Return the element if the name matches.
591
+ return lambda { |element| element if element.name == source }
592
+ when /^@[\w\-:]+$/
593
+ # An attribute name. Return its value if the attribute is specified.
594
+ attr_name = source[1..-1]
595
+ return lambda { |element| element.attributes[attr_name] }
596
+ when /^[\w\-:]+@[\w\-:]+$/
597
+ # An element with attribute name. Return the attribute value if
598
+ # the attribute is specified, and the element name matches.
599
+ tag_name, attr_name = source.match(/^([\w\-:]+)@([\w\-:]+)$/)[1..2]
600
+ return lambda do |element|
601
+ element.attributes[attr_name] if element.name == tag_name
602
+ end
603
+ else
604
+ return lambda { |element| source }
605
+ # Anything else and pianos fall from the sky.
606
+ raise ArgumentError, "Invalid extractor #{source.to_s}"
607
+ end
608
+ end
609
+
610
+
611
+ # Returns a Proc that will set the extracted value in the object.
612
+ #
613
+ # The +target+ argument identifies an instance variable. It may
614
+ # be the name of a variable, or the name of a variable prefixed
615
+ # with [] to denote an array.
616
+ #
617
+ # The Proc is called with two arguments: the object to set the
618
+ # value in, and the value.
619
+ def extract_value_to(target)
620
+ if target.is_a?(Array)
621
+ setters = target.collect do |target|
622
+ [target,extract_value_to(target)]
623
+ end
624
+ return lambda do |object,value|
625
+ setters.each do |setter|
626
+ setter[1].call(object, value.send(setter[0]))
627
+ end
628
+ end
629
+ end
630
+
631
+ if target.to_sym == :skip
632
+ return lambda do |object, value|
633
+ object.send(:skip, value)
634
+ end
635
+ end
636
+
637
+ target = target.to_s
638
+ if target[-2..-1] == "[]" or (@arrays && array = @arrays.include?(target.to_sym))
639
+ target = target[0...-2] unless array
640
+ # Create an attribute accessor if not already defined.
641
+ begin
642
+ self.instance_method(target)
643
+ rescue NameError
644
+ attr_accessor target
645
+ end
646
+ reader = "#{target}".to_sym
647
+ writer = "#{target}=".to_sym
648
+ return lambda do |object, value|
649
+ array = object.send(reader)
650
+ object.send(writer, array = []) unless array
651
+ array << value
652
+ end
653
+ else
654
+ # Create an attribute accessor if not already defined.
655
+ begin
656
+ self.instance_method(target)
657
+ rescue NameError
658
+ attr_accessor target
659
+ end
660
+ reader = "#{target}=".to_sym
661
+ return lambda { |object, value| object.send(reader, value) }
662
+ end
663
+ end
664
+
665
+
666
+ def inherited(child)
667
+ super
668
+ # Duplicate options, rules and arrays rules to any inherited class.
669
+ child.options.update self.options
670
+ child.rules.concat self.rules
671
+ child.instance_variable_set :@arrays, self.instance_variable_get(:@arrays)
672
+ end
673
+
674
+ end
675
+
676
+
677
+ unless const_defined? :READER_OPTIONS
678
+ READER_OPTIONS = [:last_modified, :etag, :redirect_limit, :user_agent, :timeout]
679
+ end
680
+
681
+
682
+ # Set to true when the first extractor returns true.
683
+ attr_accessor :extracted
684
+
685
+
686
+ # Information about the HTML page scraped. See PageInfo.
687
+ attr_accessor :page_info
688
+
689
+
690
+ # Returns the options for this object.
691
+ attr_accessor :options
692
+
693
+
694
+ # Create a new scraper instance.
695
+ #
696
+ # The argument +source+ is a URL, string containing HTML, or HTML::Node.
697
+ # The optional argument +options+ are options passed to the scraper.
698
+ # See Base#scrape for more details.
699
+ #
700
+ # For example:
701
+ # # The page we want to scrape
702
+ # url = URI.parse("http://example.com")
703
+ # # Skip the header
704
+ # scraper = MyScraper.new(url, :root_element=>"body")
705
+ # result = scraper.scrape
706
+ def initialize(source, options = nil)
707
+ @page_info = PageInfo[]
708
+ @options = options || {}
709
+ case source
710
+ when URI
711
+ @document = source
712
+ when String, HTML::Node
713
+ @document = source
714
+ # TODO: document and test case these two.
715
+ @page_info.url = @page_info.original_url = @options[:url]
716
+ @page_info.encoding = @options[:encoding]
717
+ else
718
+ raise ArgumentError, "Can only scrape URI, String or HTML::Node"
719
+ end
720
+ end
721
+
722
+
723
+ # Scrapes the document and returns the result.
724
+ #
725
+ # If the scraper was created with a URL, retrieve the page and parse it.
726
+ # If the scraper was created with a string, parse the page.
727
+ #
728
+ # The result is returned by calling the #result method. The default
729
+ # implementation returns +self+ if any extractor returned true,
730
+ # +nil+ otherwise.
731
+ #
732
+ # The method may raise any number of exceptions. HTTPError indicates
733
+ # it failed to retrieve the HTML page, and HTMLParseError that it failed
734
+ # to parse the page. Other exceptions come from extractors and the
735
+ # #result method.
736
+ #
737
+ # See also Base#scrape.
738
+ def scrape()
739
+ # Call prepare with the document, but before doing anything else.
740
+ prepare document
741
+ # Retrieve the document. This may raise HTTPError or HTMLParseError.
742
+ case document
743
+ when Array: stack = @document.reverse # see below
744
+ when HTML::Node:
745
+ # If a root element is specified, start selecting from there.
746
+ # The stack is empty if we can't find any root element (makes
747
+ # sense). However, the node we're going to process may be
748
+ # a tag, or an HTML::Document.root which is the equivalent of
749
+ # a document fragment.
750
+ root_element = option(:root_element)
751
+ root = root_element ? @document.find(:tag=>root_element) : @document
752
+ stack = root ? (root.tag? ? [root] : root.children.reverse) : []
753
+ else return
754
+ end
755
+ # @skip stores all the elements we want to skip (see #skip).
756
+ # rules stores all the rules we want to process with this
757
+ # scraper, based on the class definition.
758
+ @skip = []
759
+ @stop = false
760
+ rules = self.class.rules.clone
761
+ begin
762
+ # Process the document one node at a time. We process elements
763
+ # from the end of the stack, so each time we visit child elements,
764
+ # we add them to the end of the stack in reverse order.
765
+ while node = stack.pop
766
+ break if @stop
767
+ skip_this = false
768
+ # Only match nodes that are elements, ignore text nodes.
769
+ # Also ignore any element that's on the skip list, and if
770
+ # found one, remove it from the list (since we never visit
771
+ # the same element twice). But an element may be added twice
772
+ # to the skip list.
773
+ # Note: equal? is faster than == for nodes.
774
+ next unless node.tag?
775
+ @skip.delete_if { |s| skip_this = true if s.equal?(node) }
776
+ next if skip_this
777
+
778
+ # Run through all the rules until we process the element or
779
+ # run out of rules. If skip_this=true then we processed the
780
+ # element and we can break out of the loop. However, we might
781
+ # process (and skip) descedants so also watch the skip list.
782
+ rules.delete_if do |selector, extractor, rule_name, first_only|
783
+ break if skip_this
784
+ # The result of calling match (selected) is nil, element
785
+ # or array of elements. We turn it into an array to
786
+ # process one element at a time. We process all elements
787
+ # that are not on the skip list (we haven't visited
788
+ # them yet).
789
+ if selected = selector.match(node, first_only)
790
+ selected = [selected] unless selected.is_a?(Array)
791
+ selected = [selected.first] if first_only
792
+ selected.each do |element|
793
+ # Do not process elements we already skipped
794
+ # (see above). However, this time we may visit
795
+ # an element twice, since selected elements may
796
+ # be descendants of the current element on the
797
+ # stack. In rare cases two elements on the stack
798
+ # may pick the same descendants.
799
+ next if @skip.find { |s| s.equal?(element) }
800
+ # Call the extractor method with this element.
801
+ # If it returns true, skip the element and if
802
+ # the current element, don't process any more
803
+ # rules. Again, pay attention to descendants.
804
+ skip = extractor.bind(self).call(element)
805
+ if (skip || @skip.delete(true)) && @skip.delete(false).nil?
806
+ @extracted = true
807
+ if element.equal?(node)
808
+ skip_this = true
809
+ else
810
+ @skip << element
811
+ end
812
+ end
813
+ end
814
+ first_only if !selected.empty?
815
+ end
816
+ end
817
+
818
+ # If we did not skip the element, we're going to process its
819
+ # children. Reverse order since we're popping from the stack.
820
+ if !skip_this && children = node.children
821
+ stack.concat children.reverse
822
+ end
823
+ end
824
+ ensure
825
+ @skip = nil
826
+ end
827
+ return result
828
+ end
829
+
830
+
831
+ # Returns the document being processed.
832
+ #
833
+ # If the scraper was created with a URL, this method will attempt to
834
+ # retrieve the page and parse it.
835
+ #
836
+ # If the scraper was created with a string, this method will attempt
837
+ # to parse the page.
838
+ #
839
+ # Be advised that calling this method may raise an exception
840
+ # (HTTPError or HTMLParseError).
841
+ #
842
+ # The document is parsed only the first time this method is called.
843
+ def document
844
+ if @document.is_a?(URI)
845
+ # Attempt to read page. May raise HTTPError.
846
+ options = {}
847
+ READER_OPTIONS.each { |key| options[key] = option(key) }
848
+ request(@document, options)
849
+ end
850
+ if @document.is_a?(String)
851
+ # Parse the page. May raise HTMLParseError.
852
+ parsed = Reader.parse_page(@document, @page_info.encoding,
853
+ option(:parser_options), option(:parser))
854
+ @document = parsed.document
855
+ @page_info.encoding = parsed.encoding
856
+ end
857
+ return @document if @document.is_a?(HTML::Node)
858
+ raise RuntimeError, "No document to process"
859
+ end
860
+
861
+
862
+ def request(url, options)
863
+ if page = Reader.read_page(@document, options)
864
+ @page_info.url = page.url
865
+ @page_info.original_url = @document
866
+ @page_info.last_modified = page.last_modified
867
+ @page_info.etag = page.etag
868
+ @page_info.encoding = page.encoding
869
+ @document = page.content
870
+ end
871
+ end
872
+
873
+
874
+ # :call-seq:
875
+ # skip() => true
876
+ # skip(element) => true
877
+ # skip([element ...]) => true
878
+ #
879
+ # Skips processing the specified element(s).
880
+ #
881
+ # If called with a single element, that element will not be processed.
882
+ #
883
+ # If called with an array of elements, all the elements in the array
884
+ # are skipped.
885
+ #
886
+ # If called with no element, skips processing the current element.
887
+ # This has the same effect as returning true.
888
+ #
889
+ # For convenience this method always returns true. For example:
890
+ # process "h1" do |element|
891
+ # @header = element
892
+ # skip
893
+ # end
894
+ def skip(elements = nil)
895
+ case elements
896
+ when Array: @skip.concat elements
897
+ when HTML::Node: @skip << elements
898
+ when nil: @skip << self.element
899
+ when true, false: @skip << elements
900
+ end
901
+ # Calling skip(element) as the last statement is
902
+ # redundant by design.
903
+ return true
904
+ end
905
+
906
+
907
    # Stops processing this page. You can call this early on if you
    # discover there is no interesting information on the page, or
    # when done extracting all useful information.
    #
    # Sets the @stop flag; #scrape checks it before popping the next
    # node off its work stack, so processing ends after the current node.
    def stop()
      @stop = true
    end
913
+
914
+
915
    # Called by #scrape after creating the document, but before running
    # any processing rules.
    #
    # You can override this method to do any preparation work on the
    # parsed +document+. The default implementation does nothing.
    def prepare(document)
    end
921
+
922
+
923
+ # Returns the result of a succcessful scrape.
924
+ #
925
+ # This method is called by #scrape after running all the rules on the
926
+ # document. You can also call it directly.
927
+ #
928
+ # Override this method to return a specific object, perform post-scraping
929
+ # processing, validation, etc.
930
+ #
931
+ # The default implementation returns +self+ if any extractor returned
932
+ # true, +nil+ otherwise.
933
+ #
934
+ # If you override this method, implement your own logic to determine
935
+ # if anything was extracted and return +nil+ otherwise. Also, make sure
936
+ # calling this method multiple times returns the same result.
937
+ def result()
938
+ return self if @extracted
939
+ end
940
+
941
+
942
+ # Returns the value of an option.
943
+ #
944
+ # Returns the value of an option passed to the scraper on creation.
945
+ # If not specified, return the value of the option set for this
946
+ # scraper class. Options are inherited from the parent class.
947
+ def option(symbol)
948
+ return options.has_key?(symbol) ? options[symbol] : self.class.options[symbol]
949
+ end
950
+
951
+
952
+ end
953
+
954
+
955
+ # Define an anonymous scraper and returns the class.
956
+ #
957
+ # For example:
958
+ # links = Scraper.define do
959
+ # process "a[href]", :urls=>"@href"
960
+ # result :urls
961
+ # end
962
+ #
963
+ # puts links.scrape(html)
964
+ def self.define(&block)
965
+ kls = Class.new(Scraper::Base)
966
+ kls.module_eval &block
967
+ return kls
968
+ end
969
+
970
+ end