assaf-scrapi 1.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,105 @@
1
require 'strscan'

module HTML #:nodoc:

  # A simple HTML tokenizer. It simply breaks a stream of text into tokens, where each
  # token is a string. Each string represents either "text", or an HTML element.
  #
  # This currently assumes valid XHTML, which means no free < or > characters.
  #
  # Usage:
  #
  #   tokenizer = HTML::Tokenizer.new(text)
  #   while token = tokenizer.next
  #     p token
  #   end
  class Tokenizer #:nodoc:

    # The current (byte) position in the text
    attr_reader :position

    # The current line number
    attr_reader :line

    # Create a new Tokenizer for the given text.
    def initialize(text)
      @scanner = StringScanner.new(text)
      @position = 0
      @line = 0
      @current_line = 1
    end

    # Return the next token in the sequence, or +nil+ if there are no more tokens in
    # the stream.
    def next
      return nil if @scanner.eos?
      @position = @scanner.pos
      @line = @current_line
      # A "<" followed by a non-space starts a tag; anything else is text.
      if @scanner.check(/<\S/)
        update_current_line(scan_tag)
      else
        update_current_line(scan_text)
      end
    end

    private

    # Treat the text at the current position as a tag, and scan it. Supports
    # comments, doctype tags, and regular tags, and ignores less-than and
    # greater-than characters within quoted strings.
    def scan_tag
      tag = @scanner.getch
      if @scanner.scan(/!--/) # comment
        tag << @scanner.matched
        # Fall back to the rest of the document if the comment is unterminated.
        tag << (@scanner.scan_until(/--\s*>/) || @scanner.scan_until(/\Z/))
      elsif @scanner.scan(/!\[CDATA\[/)
        tag << @scanner.matched
        # Fix: scan_until returns nil for an unterminated CDATA section, and
        # appending nil raised TypeError. Consume the rest of the document
        # instead, mirroring the comment branch above.
        tag << (@scanner.scan_until(/\]\]>/) || @scanner.scan_until(/\Z/))
      elsif @scanner.scan(/!/) # doctype
        tag << @scanner.matched
        tag << consume_quoted_regions
      else
        tag << consume_quoted_regions
      end
      tag
    end

    # Scan all text up to the next < character and return it.
    def scan_text
      "#{@scanner.getch}#{@scanner.scan(/[^<]*/)}"
    end

    # Counts the number of newlines in the text and updates the current line
    # accordingly. Returns +text+ (String#scan with a block returns the
    # receiver), so callers can pass tokens straight through this method.
    def update_current_line(text)
      text.scan(/\r?\n/) { @current_line += 1 }
    end

    # Skips over quoted strings, so that less-than and greater-than characters
    # within the strings are ignored.
    def consume_quoted_regions
      text = ""
      loop do
        match = @scanner.scan_until(/['"<>]/) or break

        delim = @scanner.matched
        if delim == "<"
          # Push the "<" back so the next token starts on it.
          match = match.chop
          @scanner.pos -= 1
        end

        text << match
        break if delim == "<" || delim == ">"

        # consume the quoted region
        while match = @scanner.scan_until(/[\\#{delim}]/)
          text << match
          break if @scanner.matched == delim
          # Skip the escaped character; guard against a trailing backslash at
          # end of input, where getch returns nil (appending nil would raise).
          ch = @scanner.getch
          break unless ch
          text << ch
        end
      end
      text
    end
  end

end
@@ -0,0 +1,11 @@
1
module HTML #:nodoc:
  # Version of the bundled html-scanner library, exposed as
  # HTML::Version::STRING (e.g. "0.5.3").
  module Version #:nodoc:

    MAJOR = 0
    MINOR = 5
    TINY  = 3

    # Dotted version string assembled from the components above.
    STRING = "#{MAJOR}.#{MINOR}.#{TINY}"

  end
end
@@ -0,0 +1,990 @@
1
+ # ScrAPI toolkit for Ruby
2
+ #
3
+ # Copyright (c) 2006 Assaf Arkin, under Creative Commons Attribution and/or MIT License
4
+ # Developed for http://co.mments.com
5
+ # Code and documention: http://labnotes.org
6
+
7
+
8
+ require "rubygems"
9
+ require File.join(File.dirname(__FILE__), "reader")
10
+
11
+
12
+ module Scraper
13
+
14
+ class Base
15
+
16
+
17
+ # Information about the HTML page scraped. A structure with the following
18
+ # attributes:
19
+ # * <tt>url</tt> -- The URL of the document being scraped. Passed in
20
+ # the constructor but may have changed if the page was redirected.
21
+ # * <tt>original_url</tt> -- The original URL of the document being
22
+ # scraped as passed in the constructor.
23
+ # * <tt>encoding</tt> -- The encoding of the document.
24
+ # * <tt>last_modified</tt> -- Value of the Last-Modified header returned
25
+ # from the server.
26
+ # * <tt>etag</tt> -- Value of the Etag header returned from the server.
27
+ PageInfo = Struct.new(:url, :original_url, :encoding, :last_modified, :etag)
28
+
29
+
30
+ class << self
31
+
32
+ # :call-seq:
33
+ # process(symbol?, selector, values?, extractor)
34
+ # process(symbol?, selector, values?) { |element| ... }
35
+ #
36
+ # Defines a processing rule. A processing rule consists of a selector
37
+ # that matches element, and an extractor that does something interesting
38
+ # with their value.
39
+ #
40
+ # == Symbol
41
+ #
42
+ # Rules are processed in the order in which they are defined. Use #rules
43
+ # if you need to change the order of processing.
44
+ #
45
+ # Rules can be named or anonymous. If the first argument is a symbol,
46
+ # it is used as the rule name. You can use the rule name to position,
47
+ # remove or replace it.
48
+ #
49
+ # == Selector
50
+ #
51
+ # The first argument is a selector. It selects elements from the document
52
+ # that are potential candidates for extraction. Each selected element is
53
+ # passed to the extractor.
54
+ #
55
+ # The +selector+ argument may be a string, an HTML::Selector object or
56
+ # any object that responds to the +select+ method. Passing an Array
57
+ # (responds to +select+) will not do anything useful.
58
+ #
59
+ # String selectors support value substitution, replacing question marks
60
+ # (?) in the selector expression with values from the method arguments.
61
+ # See HTML::Selector for more information.
62
+ #
63
+ # == Extractor
64
+ #
65
+ # The last argument or block is the extractor. The extractor does
66
+ # something interested with the selected element, typically assigns
67
+ # it to an instance variable of the scraper.
68
+ #
69
+ # Since the extractor is called on the scraper, it can also use the
70
+ # scraper to maintain state, e.g. this extractor counts how many
71
+ # +div+ elements appear in the document:
72
+ # process "div" { |element| @count += 1 }
73
+ #
74
+ # The extractor returns +true+ if the element was processed and
75
+ # should not be passed to any other extractor (including any child
76
+ # elements).
77
+ #
78
+ # The default implementation of #result returns +self+ only if at
79
+ # least one extractor returned +true+. However, you can override
80
+ # #result and use extractors that return +false+.
81
+ #
82
+ # A block extractor is called with a single element.
83
+ #
84
+ # You can also use the #extractor method to create extractors that
85
+ # assign elements, attributes and text values to instance variables,
86
+ # or pass a +Hash+ as the last argument to #process. See #extractor
87
+ # for more information.
88
+ #
89
+ # When using a block, the last statement is the response. Do not use
90
+ # +return+, use +next+ if you want to return a value before the last
91
+ # statement. +return+ does not do what you expect it to.
92
+ #
93
+ # == Example
94
+ #
95
+ # class ScrapePosts < Scraper::Base
96
+ # # Select the title of a post
97
+ # selector :select_title, "h2"
98
+ #
99
+ # # Select the body of a post
100
+ # selector :select_body, ".body"
101
+ #
102
+ # # All elements with class name post.
103
+ # process ".post" do |element|
104
+ # title = select_title(element)
105
+ # body = select_body(element)
106
+ # @posts << Post.new(title, body)
107
+ # true
108
+ # end
109
+ #
110
+ # attr_reader :posts
111
+ # end
112
+ #
113
+ # posts = ScrapePosts.scrape(html).posts
114
+ #
115
+ # To process only a single element:
116
+ #
117
+ # class ScrapeTitle < Scraper::Base
118
+ # process "html>head>title", :title=>text
119
+ # result :title
120
+ # end
121
+ #
122
+ # puts ScrapeTitle.scrape(html)
123
# Defines a processing rule for *all* elements matched by the selector.
# Thin wrapper over #create_process with first=false; see the rdoc above
# for the full rule/selector/extractor semantics.
def process(*selector, &block)
  create_process(false, *selector, &block)
end
126
+
127
+
128
+ # Similar to #process, but only extracts from the first
129
+ # selected element. Faster if you know the document contains
130
+ # only one applicable element, or only interested in processing
131
+ # the first one.
132
# Similar to #process, but only extracts from the first
# selected element. Faster if you know the document contains
# only one applicable element, or only interested in processing
# the first one. Thin wrapper over #create_process with first=true.
def process_first(*selector, &block)
  create_process(true, *selector, &block)
end
135
+
136
+
137
+ # :call-seq:
138
+ # selector(symbol, selector, values?)
139
+ # selector(symbol, selector, values?) { |elements| ... }
140
+ #
141
+ # Create a selector method. You can call a selector method directly
142
+ # to select elements.
143
+ #
144
+ # For example, define a selector:
145
+ # selector :five_divs, "div" { |elems| elems[0..4] }
146
+ # And call it to retrieve the first five +div+ elements:
147
+ # divs = five_divs(element)
148
+ #
149
+ # Call a selector method with an element and it returns an array of
150
+ # elements that match the selector, beginning with the element argument
151
+ # itself. It returns an empty array if nothing matches.
152
+ #
153
+ # If the selector is defined with a block, all selected elements are
154
+ # passed to the block and the result of the block is returned.
155
+ #
156
+ # For convenience, a <tt>first_</tt> method is also created that
157
+ # returns (and yields) only the first selected element. For example:
158
+ # selector :post, "#post"
159
+ # @post = first_post
160
+ #
161
+ # Since the selector is defined with a block, both methods call that
162
+ # block with an array of elements.
163
+ #
164
+ # The +selector+ argument may be a string, an HTML::Selector object or
165
+ # any object that responds to the +select+ method. Passing an Array
166
+ # (responds to +select+) will not do anything useful.
167
+ #
168
+ # String selectors support value substitution, replacing question marks
169
+ # (?) in the selector expression with values from the method arguments.
170
+ # See HTML::Selector for more information.
171
+ #
172
+ # When using a block, the last statement is the response. Do not use
173
+ # +return+, use +next+ if you want to return a value before the last
174
+ # statement. +return+ does not do what you expect it to.
175
# Creates a selector method named +symbol+ plus a <tt>first_#{symbol}</tt>
# variant. The method selects elements starting from the element passed
# to it; with a block, the selected elements are passed through the block
# and its result is returned. See the rdoc above for details.
#
# Raises ArgumentError if no selector is given or the selector object
# does not respond to +select+.
def selector(symbol, *selector, &block)
  raise ArgumentError, "Missing selector: the first argument tells us what to select" if selector.empty?
  if selector[0].is_a?(String)
    # String selector (remaining arguments are substitution values).
    selector = HTML::Selector.new(*selector)
  else
    # Fix: validate the selector object itself. The original tested the
    # splat *array* (arrays always respond to select), so this guard
    # could never fire and invalid selectors slipped through.
    raise ArgumentError, "Selector must respond to select() method" unless selector[0].respond_to?(:select)
    selector = selector[0]
  end
  if block
    # Block form: hand the selected element(s) to the block; return nil
    # when nothing was selected.
    define_method symbol do |element|
      selected = selector.select(element)
      return block.call(selected) unless selected.empty?
    end
    define_method "first_#{symbol}" do |element|
      selected = selector.select_first(element)
      return block.call([selected]) if selected
    end
  else
    define_method symbol do |element|
      return selector.select(element)
    end
    define_method "first_#{symbol}" do |element|
      return selector.select_first(element)
    end
  end
end
201
+
202
+
203
+ # Creates an extractor that will extract values from the selected
204
+ # element and place them in instance variables of the scraper.
205
+ # You can pass the result to #process.
206
+ #
207
+ # == Example
208
+ #
209
+ # This example processes a document looking for an element with the
210
+ # class name +article+. It extracts the attribute +id+ and stores it
211
+ # in the instance variable +@id+. It extracts the article node itself
212
+ # and puts it in the instance variable +@article+.
213
+ #
214
+ # class ArticleScraper < Scraper::Base
215
+ # process ".article", extractor(:id=>"@id", :article=>:element)
216
+ # attr_reader :id, :article
217
+ # end
218
+ # result = ArticleScraper.scrape(html)
219
+ # puts result.id
220
+ # puts result.article
221
+ #
222
+ # == Sources
223
+ #
224
+ # Extractors operate on the selected element, and can extract the
225
+ # following values:
226
+ # * <tt>"elem_name"</tt> -- Extracts the element itself if it
227
+ # matches the element name (e.g. "h2" will extract only level 2
228
+ # header elements).
229
+ # * <tt>"attr_name"</tt> -- Extracts the attribute value from the
230
+ # element if specified (e.g. "@id" will extract the id attribute).
231
+ # * <tt>"elem_name@attr_name"</tt> -- Extracts the attribute value
232
+ # from the element if specified, but only if the element has the
233
+ # specified name (e.g. "h2@id").
234
+ # * <tt>:element</tt> -- Extracts the element itself.
235
+ # * <tt>:text</tt> -- Extracts the text value of the node.
236
+ # * <tt>Scraper</tt> -- Using this class creates a scraper to
237
+ # process the current element and extract the result. This can
238
+ # be used for handling complex structure.
239
+ #
240
+ # If you use an array of sources, the first source that matches
241
+ # anything is used. For example, <tt>["attr@title", :text]</tt>
242
+ # extracts the value of the +title+ attribute if the element is
243
+ # +abbr+, otherwise the text value of the element.
244
+ #
245
+ # If you use a hash, you can extract multiple values at the same
246
+ # time. For example, <tt>{:id=>"@id", :class=>"@class"}</tt>
247
+ # extracts the +id+ and +class+ attribute values.
248
+ #
249
+ # :element and :text are special cases of symbols. You can pass any
250
+ # symbol that matches a class method and that class method will
251
+ # be called to extract a value from the selected element.
252
+ # You can also pass a Proc or Method directly.
253
+ #
254
+ # And it's always possible to pass a static value, quite useful for
255
+ # processing an element with more than one rule (<tt>:skip=>false</tt>).
256
+ #
257
+ # == Targets
258
+ #
259
+ # Extractors assign the extracted value to an instance variable
260
+ # of the scraper. The instance variable contains the last value
261
+ # extracted.
262
+ #
263
+ # Also creates an accessor for that instance variable. An accessor
264
+ # is created if no such method exists. For example,
265
+ # <tt>:title=>:text</tt> creates an accessor for +title+. However,
266
+ # <tt>:id=>"@id"</tt> does not create an accessor since each
267
+ # object already has a method called +id+.
268
+ #
269
+ # If you want to extract multiple values into the same variables,
270
+ # use #array to declare that accessor as an array.
271
+ #
272
+ # Alternatively, you can append <tt>[]</tt> to the variable name.
273
+ # For example:
274
+ # process "*", "ids[]"=>"@id"
275
+ # result :ids
276
+ #
277
+ # The special target <tt>:skip</tt> allows you to control whether
278
+ # other rules can apply to the same element. By default a processing
279
+ # rule without a block (or a block that returns true) will skip
280
+ # that element so no other processing rule sees it.
281
+ #
282
+ # You can change this with <tt>:skip=>false</tt>.
283
# Creates an extractor Proc from a target=>source map; see the rdoc above
# for the supported sources and targets. The returned Proc is meant to be
# instance-eval'ed on a scraper: it runs every compiled pair against the
# element and always returns true (element counts as processed).
def extractor(map)
  extracts = []
  map.each_pair do |target, source|
    # Compile the pair: source reads a value from an element, target
    # stores it on the scraper instance.
    source = extract_value_from(source)
    target = extract_value_to(target)
    # define_method is used to capture the closure as an UnboundMethod
    # so it can later be bound to the scraper instance; the temporary
    # :__extractor name is removed immediately so it can be reused.
    define_method :__extractor do |element|
      value = source.call(element)
      # nil means "nothing extracted" — do not overwrite the target.
      target.call(self, value) if !value.nil?
    end
    extracts << instance_method(:__extractor)
    remove_method :__extractor
  end
  lambda do |element|
    extracts.each do |extract|
      # self here is the scraper instance the lambda is evaluated on.
      extract.bind(self).call(element)
    end
    true
  end
end
302
+
303
+
304
+ # Scrapes the document and returns the result.
305
+ #
306
+ # The first argument provides the input document. It can be one of:
307
+ # * <tt>URI</tt> -- Retrieve an HTML page from this URL and
308
+ # scrape it.
309
+ # * <tt>String</tt> -- The HTML page as a string.
310
+ # * <tt>HTML::Node</tt> -- An HTML node, can be a document
311
+ # or element.
312
+ #
313
+ # You can specify options for the scraper class, or override
314
+ # these by passing options in the second argument. Some options
315
+ # only make sense in the constructor.
316
+ #
317
+ # The following options are supported for reading HTML pages:
318
+ # * <tt>:last_modified</tt> -- Last-Modified header used for
319
+ # caching.
320
+ # * <tt>:etag</tt> -- ETag header used for caching.
321
+ # * <tt>:redirect_limit</tt> -- Limits number of redirects
322
+ # to follow.
323
+ # * <tt>:user_agent</tt> -- Value for User-Agent header.
324
+ # * <tt>:timeout</tt> -- HTTP open connection/read timeouts
325
+ # (in second).
326
+ #
327
+ # The following options are supported for parsing the HTML:
328
+ # * <tt>:root_element</tt> -- The root element to scrape, see
329
+ # also #root_elements.
330
+ # * <tt>:parser</tt> -- Specifies which parser to use.
331
+ # (Typically, you set this for the class).
332
+ # * <tt>:parser_options</tt> -- Options to pass to the parser.
333
+ #
334
+ # The result is returned by calling the #result method.
335
+ # The default implementation returns +self+ if any extractor
336
+ # returned true, +nil+ otherwise.
337
+ #
338
+ # For example:
339
+ # result = MyScraper.scrape(url, :root_element=>"body")
340
+ #
341
+ # The method may raise any number of exceptions. HTTPError
342
+ # indicates it failed to retrieve the HTML page, and HTMLParseError
343
+ # that it failed to parse the page. Other exceptions come from
344
+ # extractors and the #result method.
345
# Convenience entry point: constructs a scraper for +source+ (URL, HTML
# string or HTML::Node) with the given options and runs it, returning
# whatever the instance #scrape (and thus #result) produces. See the
# rdoc above for supported options and raised exceptions.
def scrape(source, options = nil)
  self.new(source, options).scrape
end
349
+
350
+
351
+ # Returns the text of the element.
352
+ #
353
+ # You can use this method from an extractor, e.g.:
354
+ # process "title", :title=>:text
355
# Returns the concatenated text content of the element's subtree,
# visiting nodes depth-first in document order.
#
# You can use this method from an extractor, e.g.:
#   process "title", :title=>:text
def text(element)
  collected = ""
  # Children are pushed reversed so popping yields document order.
  pending = element.children.reverse
  until pending.empty?
    node = pending.pop
    if node.tag?
      pending.concat(node.children.reverse)
    else
      collected << node.content
    end
  end
  collected
end
367
+
368
+
369
+ # Returns the element itself.
370
+ #
371
+ # You can use this method from an extractor, e.g.:
372
+ # process "h1", :header=>:element
373
# Identity extractor: returns the element passed in, unchanged.
#
# You can use this method from an extractor, e.g.:
#   process "h1", :header=>:element
def element(element)
  return element
end
376
+
377
+
378
+ # Specifies which parser to use. The default is +:tidy+.
379
# Selects which parser turns HTML into a node tree. Defaults to +:tidy+
# when called without an argument. Stored in the class options, which
# subclasses inherit (see #inherited).
def parser(name = :tidy)
  self.options.store(:parser, name)
end
382
+
383
+
384
+ # Options to pass to the parser.
385
+ #
386
+ # For example, when using Tidy, you can use these options to
387
+ # tell Tidy how to clean up the HTML.
388
+ #
389
+ # This method sets the option for the class. Classes inherit options
390
+ # from their parents. You can also pass options to the scraper object
391
+ # itself using the +:parser_options+ option.
392
# Stores options to hand to the parser (e.g. Tidy cleanup settings) in
# the class options. Subclasses inherit these; instances can override
# them via the +:parser_options+ option.
def parser_options(options)
  self.options.store(:parser_options, options)
end
395
+
396
+
397
+ # The root element to scrape.
398
+ #
399
+ # The root element for an HTML document is +html+. However, if you want
400
+ # to scrape only the header or body, you can set the root_element to
401
+ # +head+ or +body+.
402
+ #
403
+ # This method sets the root element for the class. Classes inherit
404
+ # this option from their parents. You can also pass a root element
405
+ # to the scraper object itself using the +:root_element+ option.
406
# Sets the root element to scrape (e.g. "body" or "head"; nil scrapes
# the whole document). The value is normalized to a String and stored
# in the class options; instances can override it via +:root_element+.
def root_element(name)
  self.options.store(:root_element, name ? name.to_s : nil)
end
409
+
410
+
411
+ # Returns the options for this class.
412
# Returns the lazily-created options hash for this class. Subclasses
# receive a copy of these entries via #inherited.
def options()
  @options ||= {}
end
415
+
416
+
417
+ # Returns an array of rules defined for this class. You can use this
418
+ # array to change the order of rules.
419
# Returns the lazily-created array of processing rules defined for this
# class (each entry is [selector, method, name, first]). You can use
# this array to change the order of rules. Subclasses receive a copy of
# the entries via #inherited.
def rules()
  @rules ||= []
end
422
+
423
+
424
+ # Modifies this scraper to return a single value or a structure.
425
+ # Use in combination with accessors.
426
+ #
427
+ # When called with one symbol, scraping returns the result of
428
+ # calling that method (typically an accessor). When called with
429
+ # two or more symbols, scraping returns a structure of values,
430
+ # one for each symbol.
431
+ #
432
+ # For example:
433
+ # class ScrapeTitle < Scraper::Base
434
+ # process_first "html>head>title", :title=>:text
435
+ # result :title
436
+ # end
437
+ #
438
+ # puts "Title: " + ScrapeTitle.scrape(html)
439
+ #
440
+ # class ScrapeDts < Scraper::Base
441
+ # process ".dtstart", :dtstart=>["abbr@title", :text]
442
+ # process ".dtend", :dtend=>["abbr@title", :text]
443
+ # result :dtstart, :dtend
444
+ # end
445
+ #
446
+ # dts = ScrapeDts.scrape(html)
447
+ # puts "Starts: #{dts.dtstart}"
448
+ # puts "Ends: #{dts.dtend}"
449
# Redefines the instance #result method so scraping returns a single
# accessor value (one symbol) or a Struct of values (several symbols).
# See the rdoc above for examples.
def result(*symbols)
  raise ArgumentError, "Use one symbol to return the value of this accessor, multiple symbols to returns a structure" if symbols.empty?
  symbols = symbols.map(&:to_sym)
  if symbols.length == 1
    accessor = symbols.first
    define_method :result do
      self.send(accessor)
    end
  else
    struct = Struct.new(*symbols)
    define_method :result do
      struct.new(*symbols.map { |sym| self.send(sym) })
    end
  end
end
463
+
464
+
465
+ # Declares which accessors are arrays. You can declare the
466
+ # accessor here, or use "symbol[]" as the target.
467
+ #
468
+ # For example:
469
+ # array :urls
470
+ # process "a[href]", :urls=>"@href"
471
+ # Is equivalent to:
472
+ # process "a[href]", "urls[]"=>"@href"
473
# Declares which accessors collect multiple values into an array
# (equivalent to using the "symbol[]" target form). Creates an
# attr_accessor for each symbol unless a method already exists.
def array(*symbols)
  @arrays ||= []
  symbols.map(&:to_sym).each do |sym|
    @arrays << sym
    begin
      # Only probe for an existing method; NameError means none exists.
      self.instance_method(sym)
    rescue NameError
      attr_accessor sym
    end
  end
end
485
+
486
+
487
+ private
488
+
489
+
490
+ # Called by #process and #process_first, see there for
491
+ # documentation. First argument indicates whether to
492
+ # process only the first matching element (+true+) or
493
+ # all matching elements (+false+).
494
# Called by #process and #process_first, see there for
# documentation. First argument indicates whether to
# process only the first matching element (+true+) or
# all matching elements (+false+).
#
# Builds a [selector, extractor-UnboundMethod, name, first] rule and
# appends it to #rules (or replaces the selector/extractor of an
# existing rule with the same name).
def create_process(first, *selector, &block)
  # First argument may be the rule name.
  name = selector.shift if selector.first.is_a?(Symbol)
  # Extractor is either a block, last argument or both.
  if selector.last.is_a?(Proc)
    extractor = selector.pop
  elsif selector.last.is_a?(Hash)
    # A Hash is compiled into an extractor via #extractor.
    extractor = extractor(selector.pop)
  end
  if block && extractor
    # Ugly, but no other way to chain two calls bound to the
    # scraper instance: capture each as an UnboundMethod via a
    # temporary method name, then bind both at call time.
    define_method :__extractor, extractor
    extractor1 = instance_method(:__extractor)
    define_method :__extractor, block
    extractor2 = instance_method(:__extractor)
    remove_method :__extractor
    extractor = lambda do |element|
      extractor1.bind(self).call(element)
      extractor2.bind(self).call(element)
    end
  elsif block
    extractor = block
  end
  raise ArgumentError, "Missing extractor: the last argument tells us what to extract" unless extractor
  # And if we think the extractor is the last argument,
  # it's certainly not the selector.
  raise ArgumentError, "Missing selector: the first argument tells us what to select" if selector.empty?
  if selector[0].is_a?(String)
    selector = HTML::Selector.new(*selector)
  else
    # NOTE(review): this tests the splat *array* (which always responds
    # to select), not selector[0] — the guard can never fire. Same quirk
    # as in #selector; fixing it would make previously-accepted invalid
    # arguments raise, so it is only flagged here.
    raise ArgumentError, "Selector must respond to select() method" unless selector.respond_to?(:select)
    selector = selector[0]
  end
  # Create a method for fast evaluation.
  define_method :__extractor, extractor
  method = instance_method(:__extractor)
  remove_method :__extractor
  # Decide where to put the rule.
  # NOTE(review): `pos` is computed but never used.
  pos = rules.length
  if name
    # Named rule: replace selector/extractor in place if it exists,
    # keeping its position; otherwise append.
    if find = rules.find {|rule| rule[2] == name }
      find[0] = selector
      find[1] = method
    else
      rules << [selector, method, name, first]
    end
  else
    rules << [selector, method, name, first]
  end
end
545
+
546
+
547
+ # Returns a Proc that will extract a value from an element.
548
+ #
549
+ # The +source+ argument specifies which value to extract.
550
+ # See #extractor for more details.
551
+ #
552
+ # The Proc is called with an element and returns a value
553
+ # or +nil+.
554
# Returns a Proc that will extract a value from an element.
#
# The +source+ argument specifies which value to extract; see #extractor
# for the full list (element/attribute name strings, :element/:text-style
# symbols, a Scraper class, a Proc/Method, arrays, hashes, or a static
# value). The returned Proc is called with an element and returns a
# value or +nil+.
#
# Raises ArgumentError for a Class that does not extend Scraper::Base,
# a Symbol with no matching class method, or a zero-arity Proc/Method.
def extract_value_from(source)
  case source
  when Array
    # For an array, each item is itself a source argument.
    # We stop at the first value we're able to extract.
    array = source.collect { |i| extract_value_from(i) }
    return lambda do |element|
      result = nil
      array.each { |proc| break if result = proc.call(element) }
      result
    end
  when Hash
    # For a hash, each pair is a symbol and source argument.
    # We extract all the values and set them in the hash.
    hash = source.inject({}) { |h, p| h[p[0]] = extract_value_from(p[1]); h }
    return lambda do |element|
      result = {}
      hash.each_pair do |source, target|
        if value = target.call(element)
          result[source] = value
        end
      end
      # nil (not an empty hash) signals "nothing extracted".
      result unless result.empty?
    end
  when Class
    # A class is a scraper we run on the extracted element.
    # It must extend Scraper::Base.
    klass = source
    while klass = klass.superclass
      break if klass == Scraper::Base
    end
    raise ArgumentError, "Class must be a scraper that extends Scraper::Base" unless klass
    return lambda { |element| source.new(element).scrape }
  when Symbol
    # A symbol is a class method we call with the element; it returns
    # the extracted value. Narrowed from a bare rescue modifier so only
    # a missing method (NameError) is converted to ArgumentError.
    begin
      method = method(source)
    rescue NameError
      raise ArgumentError, "No method #{source} in #{self.class}"
    end
    return lambda { |element| method.call(element) }
  when Proc, Method
    # Self evident.
    raise ArgumentError, "Proc or Method must take one argument (an element)" if source.arity == 0
    return source
  when /^[\w\-:]+$/
    # An element name. Return the element if the name matches.
    return lambda { |element| element if element.name == source }
  when /^@[\w\-:]+$/
    # An attribute name. Return its value if the attribute is specified.
    attr_name = source[1..-1]
    return lambda { |element| element.attributes[attr_name] }
  when /^[\w\-:]+@[\w\-:]+$/
    # An element with attribute name. Return the attribute value if
    # the attribute is specified, and the element name matches.
    tag_name, attr_name = source.match(/^([\w\-:]+)@([\w\-:]+)$/)[1..2]
    return lambda do |element|
      element.attributes[attr_name] if element.name == tag_name
    end
  else
    # Anything else is treated as a static value (useful e.g. for
    # :skip=>false). The original had an unreachable raise after this
    # return; that dead code has been removed.
    return lambda { |element| source }
  end
end
617
+
618
+
619
+ # Returns a Proc that will set the extract value in the object.
620
+ #
621
+ # The +target+ argument identifies an instance variable. It may
622
+ # be the name of a variable, or the name of a variable prefixed
623
+ # with [] to denote an array.
624
+ #
625
+ # The Proc is called with two arguments: the object to set the
626
+ # value in, and the value.
627
# Returns a Proc that will set the extracted value in the object.
#
# The +target+ argument identifies an instance variable. It may be the
# name of a variable, a name suffixed with [] (or one declared via
# #array) to denote an array that accumulates values, the special
# :skip target, or an Array of targets (the value is then expected to
# respond to each target name, e.g. a Struct from a nested scraper).
#
# The Proc is called with two arguments: the object to set the
# value in, and the value.
def extract_value_to(target)
  if target.is_a?(Array)
    # Fan out: each sub-target reads its component from the value.
    setters = target.collect do |item|
      [item, extract_value_to(item)]
    end
    return lambda do |object, value|
      setters.each do |setter|
        setter[1].call(object, value.send(setter[0]))
      end
    end
  end

  # :skip is not stored; it toggles skipping via the scraper's #skip.
  if target.to_sym == :skip
    return lambda do |object, value|
      object.send(:skip, value)
    end
  end

  target = target.to_s
  if target[-2..-1] == "[]" or (@arrays && array = @arrays.include?(target.to_sym))
    # Strip the [] suffix unless the name was declared via #array.
    target = target[0...-2] unless array
    # Create an attribute accessor if not already defined.
    begin
      self.instance_method(target)
    rescue NameError
      attr_accessor target
    end
    reader = "#{target}".to_sym
    writer = "#{target}=".to_sym
    # Append to the array, creating it lazily on first write.
    return lambda do |object, value|
      array = object.send(reader)
      object.send(writer, array = []) unless array
      array << value
    end
  else
    # Create an attribute accessor if not already defined.
    begin
      self.instance_method(target)
    rescue NameError
      attr_accessor target
    end
    # Fix: this held the *writer* name but was misleadingly called
    # `reader`; also removed an unused `instance = "@#{target}".to_sym`
    # local left over in the original.
    writer = "#{target}=".to_sym
    return lambda { |object, value| object.send(writer, value) }
  end
end
673
+
674
+
675
# Propagates class-level configuration to subclasses.
def inherited(child)
  super
  # Duplicate options, rules and arrays rules to any inherited class.
  # NOTE(review): the child gets its own options hash and rules array,
  # but the rule entries themselves are shared with the parent, and
  # @arrays is copied by reference — mutating it in the child would
  # also affect the parent. Confirm this sharing is intended.
  child.options.update self.options
  child.rules.concat self.rules
  child.instance_variable_set :@arrays, self.instance_variable_get(:@arrays)
end
682
+
683
+ end
684
+
685
+
686
# Option keys forwarded to the HTML page reader (see reader.rb).
# Guarded so re-loading the file does not re-assign the constant.
unless const_defined? :READER_OPTIONS
  READER_OPTIONS = [:last_modified, :etag, :redirect_limit, :user_agent, :timeout]
end


# Set to true when the first extractor returns true.
attr_accessor :extracted


# Information about the HTML page scraped. See PageInfo.
attr_accessor :page_info


# Returns the options for this object (instance-level options passed to
# the constructor; distinct from the class-level options hash).
attr_accessor :options
701
+
702
+
703
+ # Create a new scraper instance.
704
+ #
705
+ # The argument +source+ is a URL, string containing HTML, or HTML::Node.
706
+ # The optional argument +options+ are options passed to the scraper.
707
+ # See Base#scrape for more details.
708
+ #
709
+ # For example:
710
+ # # The page we want to scrape
711
+ # url = URI.parse("http://example.com")
712
+ # # Skip the header
713
+ # scraper = MyScraper.new(url, :root_element=>"body")
714
+ # result = scraper.scrape
715
# Create a new scraper instance.
#
# The argument +source+ is a URI (page fetched when scraping), a String
# of HTML, or an HTML::Node. The optional +options+ hash is stored on
# the instance; see Base#scrape for the supported keys.
#
# Raises ArgumentError for any other source type.
def initialize(source, options = nil)
  # Empty PageInfo struct; fields are filled in while scraping.
  @page_info = PageInfo[]
  @options = options || {}
  case source
  when URI
    @document = source
  when String, HTML::Node
    @document = source
    # TODO: document and test case these two.
    @page_info.url = @page_info.original_url = @options[:url]
    @page_info.encoding = @options[:encoding]
  else
    raise ArgumentError, "Can only scrape URI, String or HTML::Node"
  end
end
730
+
731
+
732
+ # Scrapes the document and returns the result.
733
+ #
734
+ # If the scraper was created with a URL, retrieve the page and parse it.
735
+ # If the scraper was created with a string, parse the page.
736
+ #
737
+ # The result is returned by calling the #result method. The default
738
+ # implementation returns +self+ if any extractor returned true,
739
+ # +nil+ otherwise.
740
+ #
741
+ # The method may raise any number of exceptions. HTTPError indicates
742
+ # it failed to retrieve the HTML page, and HTMLParseError that it failed
743
+ # to parse the page. Other exceptions come from extractors and the
744
+ # #result method.
745
+ #
746
+ # See also Base#scrape.
747
def scrape()
  # Call prepare with the document, but before doing anything else.
  # (#document may fetch and parse the page, so prepare sees the tree.)
  prepare document
  # Retrieve the document. This may raise HTTPError or HTMLParseError.
  case document
  when Array
    # Reversed because we pop nodes off the END of the stack below.
    stack = @document.reverse
  when HTML::Node
    # If a root element is specified, start selecting from there.
    # The stack is empty if we can't find any root element (makes
    # sense). However, the node we're going to process may be
    # a tag, or an HTML::Document.root which is the equivalent of
    # a document fragment.
    root_element = option(:root_element)
    root = root_element ? @document.find(:tag=>root_element) : @document
    stack = root ? (root.tag? ? [root] : root.children.reverse) : []
  else
    # Nothing we know how to process; #result is never consulted.
    return
  end
  # @skip stores all the elements we want to skip (see #skip).
  # rules stores all the rules we want to process with this
  # scraper, based on the class definition. Cloned so rule
  # retirement below doesn't mutate the class-level list.
  @skip = []
  @stop = false
  rules = self.class.rules.clone
  begin
    # Process the document one node at a time. We process elements
    # from the end of the stack, so each time we visit child elements,
    # we add them to the end of the stack in reverse order.
    while node = stack.pop
      break if @stop
      skip_this = false
      # Only match nodes that are elements, ignore text nodes.
      # Also ignore any element that's on the skip list, and if
      # found one, remove it from the list (since we never visit
      # the same element twice). But an element may be added twice
      # to the skip list.
      # Note: equal? is faster than == for nodes.
      next unless node.tag?
      @skip.delete_if { |s| skip_this = true if s.equal?(node) }
      next if skip_this

      # Run through all the rules until we process the element or
      # run out of rules. If skip_this=true then we processed the
      # element and we can break out of the loop. However, we might
      # process (and skip) descendants so also watch the skip list.
      # Each rule is [selector, extractor, rule_name, first_only];
      # rule_name is unused here but part of the rule tuple.
      rules.delete_if do |selector, extractor, rule_name, first_only|
        break if skip_this
        # The result of calling match (selected) is nil, element
        # or array of elements. We turn it into an array to
        # process one element at a time. We process all elements
        # that are not on the skip list (we haven't visited
        # them yet).
        if selected = selector.match(node, first_only)
          selected = [selected] unless selected.is_a?(Array)
          selected = [selected.first] if first_only
          selected.each do |element|
            # Do not process elements we already skipped
            # (see above). However, this time we may visit
            # an element twice, since selected elements may
            # be descendants of the current element on the
            # stack. In rare cases two elements on the stack
            # may pick the same descendants.
            next if @skip.find { |s| s.equal?(element) }
            # Call the extractor method with this element.
            # A truthy return marks the scrape as productive
            # (see #result).
            if extractor.bind(self).call(element)
              @extracted = true
            end
            # #skip with no argument pushes the literal true onto
            # @skip; deleting it here tells us the extractor asked
            # to skip. Skipping the current node ends rule
            # processing for it; skipping a descendant defers
            # until that element is popped from the stack.
            if @skip.delete(true)
              if element.equal?(node)
                skip_this = true
              else
                @skip << element
              end
            end
          end
          # This expression is the block's return value for
          # delete_if: a first_only rule that matched at least
          # once is spent, so it is removed from the rule list;
          # otherwise nil keeps the rule for later nodes.
          first_only if !selected.empty?
        end
      end

      # If we did not skip the element, we're going to process its
      # children. Reverse order since we're popping from the stack.
      if !skip_this && children = node.children
        stack.concat children.reverse
      end
    end
  ensure
    # @skip is only meaningful during a scrape; clear it even if an
    # extractor raised.
    @skip = nil
  end
  collect
  return result
end
842
+
843
+
844
+ # Returns the document being processed.
845
+ #
846
+ # If the scraper was created with a URL, this method will attempt to
847
+ # retrieve the page and parse it.
848
+ #
849
+ # If the scraper was created with a string, this method will attempt
850
+ # to parse the page.
851
+ #
852
+ # Be advised that calling this method may raise an exception
853
+ # (HTTPError or HTMLParseError).
854
+ #
855
+ # The document is parsed only the first time this method is called.
856
# Returns the document being processed, fetching and/or parsing it on
# first access.
#
# A URI is first retrieved over HTTP (may raise HTTPError); a String is
# then parsed into an HTML tree (may raise HTMLParseError). Both steps
# replace @document in place, so subsequent calls return the cached
# HTML::Node without re-fetching or re-parsing.
#
# Raises RuntimeError if no usable document remains after these steps.
def document
  if @document.is_a?(URI)
    # Collect the reader-related options and fetch the page.
    # May raise HTTPError.
    fetch_options = {}
    READER_OPTIONS.each { |key| fetch_options[key] = option(key) }
    request(@document, fetch_options)
  end
  if @document.is_a?(String)
    # Parse the page content. May raise HTMLParseError.
    parsed = Reader.parse_page(@document, @page_info.encoding,
                               option(:parser_options), option(:parser))
    @document = parsed.document
    @page_info.encoding = parsed.encoding
  end
  raise RuntimeError, "No document to process" unless @document.is_a?(HTML::Node)
  @document
end
873
+
874
+
875
# Retrieves the page at +url+ and records its metadata.
#
# On a successful read, captures the final (possibly redirected) URL,
# the original URL, last-modified time, ETag and encoding into
# @page_info, and stores the raw page content in @document for parsing.
#
# Fix: the original body read @document everywhere and ignored the
# +url+ parameter entirely. The only caller (#document) passes
# @document as +url+, so honoring the parameter preserves behavior
# while making the signature truthful.
def request(url, options)
  if page = Reader.read_page(url, options)
    @page_info.url = page.url
    @page_info.original_url = url
    @page_info.last_modified = page.last_modified
    @page_info.etag = page.etag
    @page_info.encoding = page.encoding
    @document = page.content
  end
end
885
+
886
+
887
+ # :call-seq:
888
+ # skip() => true
889
+ # skip(element) => true
890
+ # skip([element ...]) => true
891
+ #
892
+ # Skips processing the specified element(s).
893
+ #
894
+ # If called with a single element, that element will not be processed.
895
+ #
896
+ # If called with an array of elements, all the elements in the array
897
+ # are skipped.
898
+ #
899
+ # If called with no element, skips processing the current element.
900
+ # This has the same effect as returning true.
901
+ #
902
+ # For convenience this method always returns true. For example:
903
+ # process "h1" do |element|
904
+ # @header = element
905
+ # skip
906
+ # end
907
# Adds element(s) to the skip list so #scrape will not process them.
#
# * skip(element)      -- skip one HTML::Node
# * skip([e1, e2])     -- skip each element in the array
# * skip() / skip(nil) -- pushes the literal true, which #scrape
#                         interprets as "skip the current element"
# * skip(true/false)   -- pushed as-is (same current-element signal)
#
# Always returns true so it can be the last statement of an extractor
# (an extractor returning true marks the scrape as productive).
#
# Fix: the original used the Ruby 1.8-only `when x: ...` colon syntax,
# which is a syntax error on Ruby 1.9 and later; `then` is accepted by
# all versions and behaves identically.
def skip(elements = nil)
  case elements
  when Array then @skip.concat elements
  when HTML::Node then @skip << elements
  when nil then @skip << true
  when true, false then @skip << elements
  end
  # Calling skip(element) as the last statement is
  # redundant by design.
  return true
end
918
+
919
+
920
+ # Stops processing this page. You can call this early on if you
921
+ # discover there is no interesting information on the page, or
922
+ # done extracting all useful information.
923
# Halts the scrape: sets the @stop flag, which the #scrape loop checks
# before visiting each node. Useful once all interesting information
# has been extracted, or when the page turns out to be irrelevant.
def stop
  @stop = true
end
926
+
927
+
928
+ # Called by #scrape after creating the document, but before running
929
+ # any processing rules.
930
+ #
931
+ # You can override this method to do any preparation work.
932
def prepare(document)
  # Intentionally empty: this is a template-method hook. Subclasses
  # override it to inspect or adjust the parsed document before any
  # processing rules run.
end
934
+
935
+
936
+ # Called by #scrape after scraping the document, and before calling #result.
937
+ # Typically used to run any validation, post-processing steps,
938
+ # resolving referenced elements, etc.
939
def collect()
  # Intentionally empty: this is a template-method hook. Subclasses
  # override it for validation, post-processing, or resolving
  # referenced elements after all rules have run and before #result.
end
941
+
942
+
943
+ # Returns the result of a successful scrape.
944
+ #
945
+ # This method is called by #scrape after running all the rules on the
946
+ # document. You can also call it directly.
947
+ #
948
+ # Override this method to return a specific object, perform post-scraping
949
+ # processing, validation, etc.
950
+ #
951
+ # The default implementation returns +self+ if any extractor returned
952
+ # true, +nil+ otherwise.
953
+ #
954
+ # If you override this method, implement your own logic to determine
955
+ # if anything was extracted and return +nil+ otherwise. Also, make sure
956
+ # calling this method multiple times returns the same result.
957
# Returns the outcome of the scrape: +self+ when at least one extractor
# returned true (i.e. @extracted was set), +nil+ otherwise.
#
# Called by #scrape after all rules have run; may also be called
# directly, and yields the same value on repeated calls. Override to
# return a custom object or add post-scraping validation.
def result()
  @extracted ? self : nil
end
960
+
961
+
962
+ # Returns the value of an option.
963
+ #
964
+ # Returns the value of an option passed to the scraper on creation.
965
+ # If not specified, return the value of the option set for this
966
+ # scraper class. Options are inherited from the parent class.
967
# Looks up a scraper option by symbol.
#
# Instance options (passed on creation) win; when the key is absent
# there, the class-level options are consulted instead — note that a
# key explicitly set to nil on the instance still shadows the class
# value, because presence is tested with has_key?.
def option(symbol)
  if options.has_key?(symbol)
    options[symbol]
  else
    self.class.options[symbol]
  end
end
970
+
971
+
972
+ end
973
+
974
+
975
+ # Define an anonymous scraper and returns the class.
976
+ #
977
+ # For example:
978
+ # links = Scraper.define do
979
+ # process "a[href]", :urls=>"@href"
980
+ # result :urls
981
+ # end
982
+ #
983
+ # puts links.scrape(html)
984
# Builds and returns an anonymous scraper class.
#
# The block is evaluated in the context of a fresh Scraper::Base
# subclass, so it can use the class-level DSL (process, result, ...):
#
#   links = Scraper.define do
#     process "a[href]", :urls=>"@href"
#     result :urls
#   end
#   puts links.scrape(html)
def self.define(&block)
  scraper_class = Class.new(Scraper::Base)
  scraper_class.module_eval(&block)
  scraper_class
end
989
+
990
+ end