assaf-scrapi 1.2.1
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +36 -0
- data/MIT-LICENSE +20 -0
- data/README.rdoc +88 -0
- data/Rakefile +33 -0
- data/lib/html/document.rb +64 -0
- data/lib/html/htmlparser.rb +407 -0
- data/lib/html/node.rb +534 -0
- data/lib/html/node_ext.rb +86 -0
- data/lib/html/selector.rb +825 -0
- data/lib/html/tokenizer.rb +105 -0
- data/lib/html/version.rb +11 -0
- data/lib/scraper/base.rb +990 -0
- data/lib/scraper/microformats.rb +93 -0
- data/lib/scraper/reader.rb +240 -0
- data/lib/scrapi.rb +8 -0
- data/lib/tidy/libtidy.dll +0 -0
- data/lib/tidy/libtidy.so +0 -0
- data/test/mock_net_http.rb +54 -0
- data/test/node_ext_test.rb +24 -0
- data/test/reader_test.rb +318 -0
- data/test/scraper_test.rb +804 -0
- data/test/selector_test.rb +637 -0
- metadata +89 -0
@@ -0,0 +1,105 @@
|
|
1
|
+
require 'strscan'

module HTML #:nodoc:

  # A simple HTML tokenizer. It simply breaks a stream of text into tokens, where each
  # token is a string. Each string represents either "text", or an HTML element.
  #
  # This currently assumes valid XHTML, which means no free < or > characters.
  #
  # Usage:
  #
  #   tokenizer = HTML::Tokenizer.new(text)
  #   while token = tokenizer.next
  #     p token
  #   end
  class Tokenizer #:nodoc:

    # The current (byte) position in the text
    attr_reader :position

    # The current line number
    attr_reader :line

    # Create a new Tokenizer for the given text.
    def initialize(text)
      @scanner = StringScanner.new(text)
      @position = 0
      @line = 0
      @current_line = 1
    end

    # Return the next token in the sequence, or +nil+ if there are no more tokens in
    # the stream.
    def next
      return nil if @scanner.eos?
      @position = @scanner.pos
      @line = @current_line
      if @scanner.check(/<\S/)
        update_current_line(scan_tag)
      else
        update_current_line(scan_text)
      end
    end

    private

      # Treat the text at the current position as a tag, and scan it. Supports
      # comments, CDATA sections, doctype tags, and regular tags, and ignores
      # less-than and greater-than characters within quoted strings.
      def scan_tag
        tag = @scanner.getch
        if @scanner.scan(/!--/) # comment
          tag << @scanner.matched
          tag << (@scanner.scan_until(/--\s*>/) || @scanner.scan_until(/\Z/))
        elsif @scanner.scan(/!\[CDATA\[/)
          tag << @scanner.matched
          # FIX: an unterminated CDATA section used to make scan_until return
          # nil here, so `tag << nil` raised TypeError. Fall back to consuming
          # the rest of the input, mirroring the comment branch above.
          tag << (@scanner.scan_until(/\]\]>/) || @scanner.scan_until(/\Z/))
        elsif @scanner.scan(/!/) # doctype
          tag << @scanner.matched
          tag << consume_quoted_regions
        else
          tag << consume_quoted_regions
        end
        tag
      end

      # Scan all text up to the next < character and return it.
      def scan_text
        "#{@scanner.getch}#{@scanner.scan(/[^<]*/)}"
      end

      # Counts the number of newlines in the text and updates the current line
      # accordingly. Returns +text+ itself (String#scan with a block returns
      # its receiver), which is how #next returns the token.
      def update_current_line(text)
        text.scan(/\r?\n/) { @current_line += 1 }
      end

      # Skips over quoted strings, so that less-than and greater-than characters
      # within the strings are ignored.
      def consume_quoted_regions
        text = ""
        loop do
          match = @scanner.scan_until(/['"<>]/) or break

          delim = @scanner.matched
          if delim == "<"
            # Put the < back; it belongs to the next token.
            match = match.chop
            @scanner.pos -= 1
          end

          text << match
          break if delim == "<" || delim == ">"

          # consume the quoted region
          while match = @scanner.scan_until(/[\\#{delim}]/)
            text << match
            break if @scanner.matched == delim
            text << @scanner.getch # skip the escaped character
          end
        end
        text
      end
  end

end
|
data/lib/html/version.rb
ADDED
data/lib/scraper/base.rb
ADDED
@@ -0,0 +1,990 @@
|
|
1
|
+
# ScrAPI toolkit for Ruby
|
2
|
+
#
|
3
|
+
# Copyright (c) 2006 Assaf Arkin, under Creative Commons Attribution and/or MIT License
|
4
|
+
# Developed for http://co.mments.com
|
5
|
+
# Code and documention: http://labnotes.org
|
6
|
+
|
7
|
+
|
8
|
+
require "rubygems"
|
9
|
+
require File.join(File.dirname(__FILE__), "reader")
|
10
|
+
|
11
|
+
|
12
|
+
module Scraper
|
13
|
+
|
14
|
+
class Base
|
15
|
+
|
16
|
+
|
17
|
+
# Information about the HTML page scraped. A structure with the following
# attributes:
# * <tt>url</tt> -- The URL of the document being scraped. Passed in
#   the constructor but may have changed if the page was redirected.
# * <tt>original_url</tt> -- The original URL of the document being
#   scraped as passed in the constructor.
# * <tt>encoding</tt> -- The encoding of the document.
# * <tt>last_modified</tt> -- Value of the Last-Modified header returned
#   from the server.
# * <tt>etag</tt> -- Value of the Etag header returned from the server.
# All members default to +nil+; #initialize fills in url, original_url
# and encoding when scraping a String or HTML::Node source.
PageInfo = Struct.new(:url, :original_url, :encoding, :last_modified, :etag)
|
28
|
+
|
29
|
+
|
30
|
+
class << self
|
31
|
+
|
32
|
+
# :call-seq:
|
33
|
+
# process(symbol?, selector, values?, extractor)
|
34
|
+
# process(symbol?, selector, values?) { |element| ... }
|
35
|
+
#
|
36
|
+
# Defines a processing rule. A processing rule consists of a selector
|
37
|
+
# that matches element, and an extractor that does something interesting
|
38
|
+
# with their value.
|
39
|
+
#
|
40
|
+
# == Symbol
|
41
|
+
#
|
42
|
+
# Rules are processed in the order in which they are defined. Use #rules
|
43
|
+
# if you need to change the order of processing.
|
44
|
+
#
|
45
|
+
# Rules can be named or anonymous. If the first argument is a symbol,
|
46
|
+
# it is used as the rule name. You can use the rule name to position,
|
47
|
+
# remove or replace it.
|
48
|
+
#
|
49
|
+
# == Selector
|
50
|
+
#
|
51
|
+
# The first argument is a selector. It selects elements from the document
|
52
|
+
# that are potential candidates for extraction. Each selected element is
|
53
|
+
# passed to the extractor.
|
54
|
+
#
|
55
|
+
# The +selector+ argument may be a string, an HTML::Selector object or
|
56
|
+
# any object that responds to the +select+ method. Passing an Array
|
57
|
+
# (responds to +select+) will not do anything useful.
|
58
|
+
#
|
59
|
+
# String selectors support value substitution, replacing question marks
|
60
|
+
# (?) in the selector expression with values from the method arguments.
|
61
|
+
# See HTML::Selector for more information.
|
62
|
+
#
|
63
|
+
# == Extractor
|
64
|
+
#
|
65
|
+
# The last argument or block is the extractor. The extractor does
|
66
|
+
# something interested with the selected element, typically assigns
|
67
|
+
# it to an instance variable of the scraper.
|
68
|
+
#
|
69
|
+
# Since the extractor is called on the scraper, it can also use the
|
70
|
+
# scraper to maintain state, e.g. this extractor counts how many
|
71
|
+
# +div+ elements appear in the document:
|
72
|
+
# process "div" { |element| @count += 1 }
|
73
|
+
#
|
74
|
+
# The extractor returns +true+ if the element was processed and
|
75
|
+
# should not be passed to any other extractor (including any child
|
76
|
+
# elements).
|
77
|
+
#
|
78
|
+
# The default implementation of #result returns +self+ only if at
|
79
|
+
# least one extractor returned +true+. However, you can override
|
80
|
+
# #result and use extractors that return +false+.
|
81
|
+
#
|
82
|
+
# A block extractor is called with a single element.
|
83
|
+
#
|
84
|
+
# You can also use the #extractor method to create extractors that
|
85
|
+
# assign elements, attributes and text values to instance variables,
|
86
|
+
# or pass a +Hash+ as the last argument to #process. See #extractor
|
87
|
+
# for more information.
|
88
|
+
#
|
89
|
+
# When using a block, the last statement is the response. Do not use
|
90
|
+
# +return+, use +next+ if you want to return a value before the last
|
91
|
+
# statement. +return+ does not do what you expect it to.
|
92
|
+
#
|
93
|
+
# == Example
|
94
|
+
#
|
95
|
+
# class ScrapePosts < Scraper::Base
|
96
|
+
# # Select the title of a post
|
97
|
+
# selector :select_title, "h2"
|
98
|
+
#
|
99
|
+
# # Select the body of a post
|
100
|
+
# selector :select_body, ".body"
|
101
|
+
#
|
102
|
+
# # All elements with class name post.
|
103
|
+
# process ".post" do |element|
|
104
|
+
# title = select_title(element)
|
105
|
+
# body = select_body(element)
|
106
|
+
# @posts << Post.new(title, body)
|
107
|
+
# true
|
108
|
+
# end
|
109
|
+
#
|
110
|
+
# attr_reader :posts
|
111
|
+
# end
|
112
|
+
#
|
113
|
+
# posts = ScrapePosts.scrape(html).posts
|
114
|
+
#
|
115
|
+
# To process only a single element:
|
116
|
+
#
|
117
|
+
# class ScrapeTitle < Scraper::Base
|
118
|
+
# process "html>head>title", :title=>text
|
119
|
+
# result :title
|
120
|
+
# end
|
121
|
+
#
|
122
|
+
# puts ScrapeTitle.scrape(html)
|
123
|
+
def process(*selector, &block)
  # Register a rule applied to every element matched by the selector.
  create_process(false, *selector, &block)
end
|
126
|
+
|
127
|
+
|
128
|
+
# Similar to #process, but only extracts from the first
|
129
|
+
# selected element. Faster if you know the document contains
|
130
|
+
# only one applicable element, or only interested in processing
|
131
|
+
# the first one.
|
132
|
+
def process_first(*selector, &block)
  # Register a rule that processes only the first element matched by the selector.
  create_process(true, *selector, &block)
end
|
135
|
+
|
136
|
+
|
137
|
+
# :call-seq:
|
138
|
+
# selector(symbol, selector, values?)
|
139
|
+
# selector(symbol, selector, values?) { |elements| ... }
|
140
|
+
#
|
141
|
+
# Create a selector method. You can call a selector method directly
|
142
|
+
# to select elements.
|
143
|
+
#
|
144
|
+
# For example, define a selector:
|
145
|
+
# selector :five_divs, "div" { |elems| elems[0..4] }
|
146
|
+
# And call it to retrieve the first five +div+ elements:
|
147
|
+
# divs = five_divs(element)
|
148
|
+
#
|
149
|
+
# Call a selector method with an element and it returns an array of
|
150
|
+
# elements that match the selector, beginning with the element argument
|
151
|
+
# itself. It returns an empty array if nothing matches.
|
152
|
+
#
|
153
|
+
# If the selector is defined with a block, all selected elements are
|
154
|
+
# passed to the block and the result of the block is returned.
|
155
|
+
#
|
156
|
+
# For convenience, a <tt>first_</tt> method is also created that
|
157
|
+
# returns (and yields) only the first selected element. For example:
|
158
|
+
# selector :post, "#post"
|
159
|
+
# @post = first_post
|
160
|
+
#
|
161
|
+
# Since the selector is defined with a block, both methods call that
|
162
|
+
# block with an array of elements.
|
163
|
+
#
|
164
|
+
# The +selector+ argument may be a string, an HTML::Selector object or
|
165
|
+
# any object that responds to the +select+ method. Passing an Array
|
166
|
+
# (responds to +select+) will not do anything useful.
|
167
|
+
#
|
168
|
+
# String selectors support value substitution, replacing question marks
|
169
|
+
# (?) in the selector expression with values from the method arguments.
|
170
|
+
# See HTML::Selector for more information.
|
171
|
+
#
|
172
|
+
# When using a block, the last statement is the response. Do not use
|
173
|
+
# +return+, use +next+ if you want to return a value before the last
|
174
|
+
# statement. +return+ does not do what you expect it to.
|
175
|
+
# Defines a selector method named +symbol+ (and a companion
# <tt>first_#{symbol}</tt> method) from a selector expression. See the
# class documentation above for the full contract.
#
# Raises ArgumentError when no selector is given or when a non-String
# selector does not respond to select().
def selector(symbol, *selector, &block)
  raise ArgumentError, "Missing selector: the first argument tells us what to select" if selector.empty?
  if selector[0].is_a?(String)
    selector = HTML::Selector.new(*selector)
  else
    # FIX: the guard must check the selector object itself, not the
    # argument array -- Array responds to select, so the old check
    # `selector.respond_to?(:select)` could never fail.
    raise ArgumentError, "Selector must respond to select() method" unless selector[0].respond_to?(:select)
    selector = selector[0]
  end
  if block
    # With a block: pass the selected elements through the block.
    define_method symbol do |element|
      selected = selector.select(element)
      return block.call(selected) unless selected.empty?
    end
    define_method "first_#{symbol}" do |element|
      selected = selector.select_first(element)
      return block.call([selected]) if selected
    end
  else
    # Without a block: return the selected elements as-is.
    define_method symbol do |element|
      return selector.select(element)
    end
    define_method "first_#{symbol}" do |element|
      return selector.select_first(element)
    end
  end
end
|
201
|
+
|
202
|
+
|
203
|
+
# Creates an extractor that will extract values from the selected
|
204
|
+
# element and place them in instance variables of the scraper.
|
205
|
+
# You can pass the result to #process.
|
206
|
+
#
|
207
|
+
# == Example
|
208
|
+
#
|
209
|
+
# This example processes a document looking for an element with the
|
210
|
+
# class name +article+. It extracts the attribute +id+ and stores it
|
211
|
+
# in the instance variable +@id+. It extracts the article node itself
|
212
|
+
# and puts it in the instance variable +@article+.
|
213
|
+
#
|
214
|
+
# class ArticleScraper < Scraper::Base
|
215
|
+
# process ".article", extractor(:id=>"@id", :article=>:element)
|
216
|
+
# attr_reader :id, :article
|
217
|
+
# end
|
218
|
+
# result = ArticleScraper.scrape(html)
|
219
|
+
# puts result.id
|
220
|
+
# puts result.article
|
221
|
+
#
|
222
|
+
# == Sources
|
223
|
+
#
|
224
|
+
# Extractors operate on the selected element, and can extract the
|
225
|
+
# following values:
|
226
|
+
# * <tt>"elem_name"</tt> -- Extracts the element itself if it
|
227
|
+
# matches the element name (e.g. "h2" will extract only level 2
|
228
|
+
# header elements).
|
229
|
+
# * <tt>"attr_name"</tt> -- Extracts the attribute value from the
|
230
|
+
# element if specified (e.g. "@id" will extract the id attribute).
|
231
|
+
# * <tt>"elem_name@attr_name"</tt> -- Extracts the attribute value
|
232
|
+
# from the element if specified, but only if the element has the
|
233
|
+
# specified name (e.g. "h2@id").
|
234
|
+
# * <tt>:element</tt> -- Extracts the element itself.
|
235
|
+
# * <tt>:text</tt> -- Extracts the text value of the node.
|
236
|
+
# * <tt>Scraper</tt> -- Using this class creates a scraper to
|
237
|
+
# process the current element and extract the result. This can
|
238
|
+
# be used for handling complex structure.
|
239
|
+
#
|
240
|
+
# If you use an array of sources, the first source that matches
|
241
|
+
# anything is used. For example, <tt>["attr@title", :text]</tt>
|
242
|
+
# extracts the value of the +title+ attribute if the element is
|
243
|
+
# +abbr+, otherwise the text value of the element.
|
244
|
+
#
|
245
|
+
# If you use a hash, you can extract multiple values at the same
|
246
|
+
# time. For example, <tt>{:id=>"@id", :class=>"@class"}</tt>
|
247
|
+
# extracts the +id+ and +class+ attribute values.
|
248
|
+
#
|
249
|
+
# :element and :text are special cases of symbols. You can pass any
|
250
|
+
# symbol that matches a class method and that class method will
|
251
|
+
# be called to extract a value from the selected element.
|
252
|
+
# You can also pass a Proc or Method directly.
|
253
|
+
#
|
254
|
+
# And it's always possible to pass a static value, quite useful for
|
255
|
+
# processing an element with more than one rule (<tt>:skip=>false</tt>).
|
256
|
+
#
|
257
|
+
# == Targets
|
258
|
+
#
|
259
|
+
# Extractors assign the extracted value to an instance variable
|
260
|
+
# of the scraper. The instance variable contains the last value
|
261
|
+
# extracted.
|
262
|
+
#
|
263
|
+
# Also creates an accessor for that instance variable. An accessor
|
264
|
+
# is created if no such method exists. For example,
|
265
|
+
# <tt>:title=>:text</tt> creates an accessor for +title+. However,
|
266
|
+
# <tt>:id=>"@id"</tt> does not create an accessor since each
|
267
|
+
# object already has a method called +id+.
|
268
|
+
#
|
269
|
+
# If you want to extract multiple values into the same variables,
|
270
|
+
# use #array to declare that accessor as an array.
|
271
|
+
#
|
272
|
+
# Alternatively, you can append <tt>[]</tt> to the variable name.
|
273
|
+
# For example:
|
274
|
+
# process "*", "ids[]"=>"@id"
|
275
|
+
# result :ids
|
276
|
+
#
|
277
|
+
# The special target <tt>:skip</tt> allows you to control whether
|
278
|
+
# other rules can apply to the same element. By default a processing
|
279
|
+
# rule without a block (or a block that returns true) will skip
|
280
|
+
# that element so no other processing rule sees it.
|
281
|
+
#
|
282
|
+
# You can change this with <tt>:skip=>false</tt>.
|
283
|
+
# Creates an extractor Proc from a map of target=>source pairs. Each
# source is compiled with extract_value_from and each target with
# extract_value_to; applying the returned Proc to an element runs every
# pair, assigning only values that are not nil. Always returns true, so
# a rule using this extractor marks the element as processed.
def extractor(map)
  extracts = []
  map.each_pair do |target, source|
    source = extract_value_from(source)
    target = extract_value_to(target)
    # Define a throwaway method so we can capture an UnboundMethod that
    # is later re-bound to the scraper instance (see the lambda below).
    define_method :__extractor do |element|
      value = source.call(element)
      target.call(self, value) if !value.nil?
    end
    extracts << instance_method(:__extractor)
    remove_method :__extractor
  end
  lambda do |element|
    extracts.each do |extract|
      extract.bind(self).call(element)
    end
    # Returning true marks the element as extracted.
    true
  end
end
|
302
|
+
|
303
|
+
|
304
|
+
# Scrapes the document and returns the result.
|
305
|
+
#
|
306
|
+
# The first argument provides the input document. It can be one of:
|
307
|
+
# * <tt>URI</tt> -- Retrieve an HTML page from this URL and
|
308
|
+
# scrape it.
|
309
|
+
# * <tt>String</tt> -- The HTML page as a string.
|
310
|
+
# * <tt>HTML::Node</tt> -- An HTML node, can be a document
|
311
|
+
# or element.
|
312
|
+
#
|
313
|
+
# You can specify options for the scraper class, or override
|
314
|
+
# these by passing options in the second argument. Some options
|
315
|
+
# only make sense in the constructor.
|
316
|
+
#
|
317
|
+
# The following options are supported for reading HTML pages:
|
318
|
+
# * <tt>:last_modified</tt> -- Last-Modified header used for
|
319
|
+
# caching.
|
320
|
+
# * <tt>:etag</tt> -- ETag header used for caching.
|
321
|
+
# * <tt>:redirect_limit</tt> -- Limits number of redirects
|
322
|
+
# to follow.
|
323
|
+
# * <tt>:user_agent</tt> -- Value for User-Agent header.
|
324
|
+
# * <tt>:timeout</tt> -- HTTP open connection/read timeouts
|
325
|
+
# (in second).
|
326
|
+
#
|
327
|
+
# The following options are supported for parsing the HTML:
|
328
|
+
# * <tt>:root_element</tt> -- The root element to scrape, see
|
329
|
+
# also #root_elements.
|
330
|
+
# * <tt>:parser</tt> -- Specifies which parser to use.
|
331
|
+
# (Typically, you set this for the class).
|
332
|
+
# * <tt>:parser_options</tt> -- Options to pass to the parser.
|
333
|
+
#
|
334
|
+
# The result is returned by calling the #result method.
|
335
|
+
# The default implementation returns +self+ if any extractor
|
336
|
+
# returned true, +nil+ otherwise.
|
337
|
+
#
|
338
|
+
# For example:
|
339
|
+
# result = MyScraper.scrape(url, :root_element=>"body")
|
340
|
+
#
|
341
|
+
# The method may raise any number of exceptions. HTTPError
|
342
|
+
# indicates it failed to retrieve the HTML page, and HTMLParseError
|
343
|
+
# that it failed to parse the page. Other exceptions come from
|
344
|
+
# extractors and the #result method.
|
345
|
+
# Convenience entry point: constructs a scraper for +source+ (a URI,
# HTML string or HTML::Node) with the given options, runs it, and
# returns whatever the instance's #scrape (ultimately #result) yields.
# See the documentation above for the supported options and the
# exceptions that may be raised.
def scrape(source, options = nil)
  new(source, options).scrape
end
|
349
|
+
|
350
|
+
|
351
|
+
# Returns the text of the element.
|
352
|
+
#
|
353
|
+
# You can use this method from an extractor, e.g.:
|
354
|
+
# process "title", :title=>:text
|
355
|
+
# Returns the concatenated text content of the element's descendants,
# visited depth-first in document order.
#
# Usable as an extractor source, e.g.:
#   process "title", :title=>:text
def text(element)
  result = ""
  # Explicit stack traversal; children are pushed reversed so they are
  # popped in document order.
  pending = element.children.reverse
  while node = pending.pop
    if node.tag?
      pending.concat(node.children.reverse)
    else
      result << node.content
    end
  end
  result
end
|
367
|
+
|
368
|
+
|
369
|
+
# Returns the element itself.
|
370
|
+
#
|
371
|
+
# You can use this method from an extractor, e.g.:
|
372
|
+
# process "h1", :header=>:element
|
373
|
+
# Identity extractor: returns the selected element itself.
#
# Usable as an extractor source, e.g.:
#   process "h1", :header=>:element
def element(node)
  node
end
|
376
|
+
|
377
|
+
|
378
|
+
# Specifies which parser to use. The default is +:tidy+.
|
379
|
+
# Selects which parser turns HTML into a node tree; defaults to +:tidy+.
# Stored in the class-level options hash, which subclasses inherit.
def parser(name = :tidy)
  options[:parser] = name
end
|
382
|
+
|
383
|
+
|
384
|
+
# Options to pass to the parser.
|
385
|
+
#
|
386
|
+
# For example, when using Tidy, you can use these options to
|
387
|
+
# tell Tidy how to clean up the HTML.
|
388
|
+
#
|
389
|
+
# This method sets the option for the class. Classes inherit options
|
390
|
+
# from their parents. You can also pass options to the scraper object
|
391
|
+
# itself using the +:parser_options+ option.
|
392
|
+
# Sets class-level options forwarded to the parser (e.g. Tidy cleanup
# flags). Subclasses inherit this setting; instances can override it
# with the +:parser_options+ option.
def parser_options(opts)
  options[:parser_options] = opts
end
|
395
|
+
|
396
|
+
|
397
|
+
# The root element to scrape.
|
398
|
+
#
|
399
|
+
# The root element for an HTML document is +html+. However, if you want
|
400
|
+
# to scrape only the header or body, you can set the root_element to
|
401
|
+
# +head+ or +body+.
|
402
|
+
#
|
403
|
+
# This method sets the root element for the class. Classes inherit
|
404
|
+
# this option from their parents. You can also pass a root element
|
405
|
+
# to the scraper object itself using the +:root_element+ option.
|
406
|
+
# Restricts scraping to the named root element (e.g. "head" or "body");
# pass +nil+ to clear. Stored in the class-level options, which
# subclasses inherit; instances can override with +:root_element+.
def root_element(name)
  options[:root_element] = name ? name.to_s : nil
end
|
409
|
+
|
410
|
+
|
411
|
+
# Returns the options for this class.
|
412
|
+
# Returns the hash of class-level options, lazily initialized.
# Subclasses receive a copy of their parent's options via #inherited.
def options()
  @options ||= {}
end
|
415
|
+
|
416
|
+
|
417
|
+
# Returns an array of rules defined for this class. You can use this
|
418
|
+
# array to change the order of rules.
|
419
|
+
# Returns the array of processing rules defined for this class, lazily
# initialized. Each rule is a four-element array of
# [selector, extractor_unbound_method, name_or_nil, first_only_flag]
# (see create_process). Reorder this array to change processing order.
def rules()
  @rules ||= []
end
|
422
|
+
|
423
|
+
|
424
|
+
# Modifies this scraper to return a single value or a structure.
|
425
|
+
# Use in combination with accessors.
|
426
|
+
#
|
427
|
+
# When called with one symbol, scraping returns the result of
|
428
|
+
# calling that method (typically an accessor). When called with
|
429
|
+
# two or more symbols, scraping returns a structure of values,
|
430
|
+
# one for each symbol.
|
431
|
+
#
|
432
|
+
# For example:
|
433
|
+
# class ScrapeTitle < Scraper::Base
|
434
|
+
# process_first "html>head>title", :title=>:text
|
435
|
+
# result :title
|
436
|
+
# end
|
437
|
+
#
|
438
|
+
# puts "Title: " + ScrapeTitle.scrape(html)
|
439
|
+
#
|
440
|
+
# class ScrapeDts < Scraper::Base
|
441
|
+
# process ".dtstart", :dtstart=>["abbr@title", :text]
|
442
|
+
# process ".dtend", :dtend=>["abbr@title", :text]
|
443
|
+
# result :dtstart, :dtend
|
444
|
+
# end
|
445
|
+
#
|
446
|
+
# dts = ScrapeDts.scrape(html)
|
447
|
+
# puts "Starts: #{dts.dtstart}"
|
448
|
+
# puts "Ends: #{dts.dtend}"
|
449
|
+
# Redefines #result for this scraper class. With a single symbol the
# scrape returns the value of that accessor; with several symbols it
# returns a Struct holding one value per accessor. See the class
# documentation above for examples.
def result(*symbols)
  raise ArgumentError, "Use one symbol to return the value of this accessor, multiple symbols to returns a structure" if symbols.empty?
  names = symbols.map(&:to_sym)
  if names.size == 1
    only = names.first
    define_method :result do
      send(only)
    end
  else
    struct = Struct.new(*names)
    define_method :result do
      struct.new(*names.map { |name| send(name) })
    end
  end
end
|
463
|
+
|
464
|
+
|
465
|
+
# Declares which accessors are arrays. You can declare the
|
466
|
+
# accessor here, or use "symbol[]" as the target.
|
467
|
+
#
|
468
|
+
# For example:
|
469
|
+
# array :urls
|
470
|
+
# process "a[href]", :urls=>"@href"
|
471
|
+
# Is equivalent to:
|
472
|
+
# process "a[href]", "urls[]"=>"@href"
|
473
|
+
# Declares the given accessors as arrays: extracted values accumulate
# into an array instead of overwriting. Equivalent to targeting
# "name[]" in a processing rule. Creates an attr_accessor for each
# symbol unless a method of that name already exists.
def array(*symbols)
  @arrays ||= []
  symbols.each do |sym|
    sym = sym.to_sym
    @arrays << sym
    begin
      instance_method(sym)
    rescue NameError
      # No such method yet: create the accessor.
      attr_accessor sym
    end
  end
end
|
485
|
+
|
486
|
+
|
487
|
+
private
|
488
|
+
|
489
|
+
|
490
|
+
# Called by #process and #process_first, see there for
|
491
|
+
# documentation. First argument indicates whether to
|
492
|
+
# process only the first matching element (+true+) or
|
493
|
+
# all matching elements (+false+).
|
494
|
+
# Called by #process and #process_first, see there for documentation.
# +first+ indicates whether to process only the first matching element
# (+true+) or all matching elements (+false+).
#
# Accepts an optional leading Symbol (the rule name), a selector
# (String expression or object responding to select), and an extractor
# given as a Hash, a Proc, a block, or a Proc/Hash combined with a
# block. Appends the rule to #rules, or replaces an existing rule with
# the same name in place.
def create_process(first, *selector, &block)
  # First argument may be the rule name.
  name = selector.shift if selector.first.is_a?(Symbol)
  # Extractor is either a block, last argument or both.
  if selector.last.is_a?(Proc)
    extractor = selector.pop
  elsif selector.last.is_a?(Hash)
    extractor = extractor(selector.pop)
  end
  if block && extractor
    # Ugly, but no other way to chain two calls bound to the
    # scraper instance: capture both as UnboundMethods and re-bind.
    define_method :__extractor, extractor
    extractor1 = instance_method(:__extractor)
    define_method :__extractor, block
    extractor2 = instance_method(:__extractor)
    remove_method :__extractor
    extractor = lambda do |element|
      extractor1.bind(self).call(element)
      extractor2.bind(self).call(element)
    end
  elsif block
    extractor = block
  end
  raise ArgumentError, "Missing extractor: the last argument tells us what to extract" unless extractor
  # And if we think the extractor is the last argument,
  # it's certainly not the selector.
  raise ArgumentError, "Missing selector: the first argument tells us what to select" if selector.empty?
  if selector[0].is_a?(String)
    selector = HTML::Selector.new(*selector)
  else
    # FIX: check the selector object, not the argument array -- Array
    # responds to select, so the original guard could never fail.
    raise ArgumentError, "Selector must respond to select() method" unless selector[0].respond_to?(:select)
    selector = selector[0]
  end
  # Create a method for fast evaluation.
  define_method :__extractor, extractor
  method = instance_method(:__extractor)
  remove_method :__extractor
  # Decide where to put the rule: replace an existing named rule in
  # place, otherwise append. (Removed the unused `pos` local.)
  if name && (found = rules.find { |rule| rule[2] == name })
    found[0] = selector
    found[1] = method
    found[3] = first # FIX: also update the first/all flag on replacement
  else
    rules << [selector, method, name, first]
  end
end
|
545
|
+
|
546
|
+
|
547
|
+
# Returns a Proc that will extract a value from an element.
|
548
|
+
#
|
549
|
+
# The +source+ argument specifies which value to extract.
|
550
|
+
# See #extractor for more details.
|
551
|
+
#
|
552
|
+
# The Proc is called with an element and returns a value
|
553
|
+
# or +nil+.
|
554
|
+
# Returns a Proc that will extract a value from an element.
#
# The +source+ argument specifies which value to extract; see
# #extractor for the full list of supported forms (array, hash,
# scraper class, symbol, Proc/Method, element/attribute strings,
# or a static value).
#
# The Proc is called with an element and returns a value or +nil+.
def extract_value_from(source)
  case source
  when Array
    # For an array, each item is itself a source argument.
    # We stop at the first value we're able to extract.
    array = source.collect { |i| extract_value_from(i) }
    return lambda do |element|
      result = nil
      array.each { |proc| break if result = proc.call(element) }
      result
    end
  when Hash
    # For a hash, each pair is a symbol and source argument.
    # We extract all the values and set them in the hash.
    hash = source.inject({}) { |h,p| h[p[0]] = extract_value_from(p[1]) ; h }
    return lambda do |element|
      result = {}
      hash.each_pair do |source, target|
        if value = target.call(element)
          result[source] = value
        end
      end
      result unless result.empty?
    end
  when Class
    # A class is a scraper we run on the extracted element.
    # It must extend Scraper::Base.
    klass = source
    while klass = klass.superclass
      break if klass == Scraper::Base
    end
    raise ArgumentError, "Class must be a scraper that extends Scraper::Base" unless klass
    return lambda { |element| source.new(element).scrape }
  when Symbol
    # A symbol is a method we call. We pass it the element
    # and it returns the extracted value. It must be a class method.
    # FIX: the message used #{self.class}, which inside class << self is
    # always Class; use #{self} to name the actual scraper class.
    method = method(source) rescue raise(ArgumentError, "No method #{source} in #{self}")
    return lambda { |element| method.call(element) }
  when Proc, Method
    # Self evident.
    raise ArgumentError, "Proc or Method must take one argument (an element)" if source.arity == 0
    return source
  when /^[\w\-:]+$/
    # An element name. Return the element if the name matches.
    return lambda { |element| element if element.name == source }
  when /^@[\w\-:]+$/
    # An attribute name. Return its value if the attribute is specified.
    attr_name = source[1..-1]
    return lambda { |element| element.attributes[attr_name] }
  when /^[\w\-:]+@[\w\-:]+$/
    # An element with attribute name. Return the attribute value if
    # the attribute is specified, and the element name matches.
    tag_name, attr_name = source.match(/^([\w\-:]+)@([\w\-:]+)$/)[1..2]
    return lambda do |element|
      element.attributes[attr_name] if element.name == tag_name
    end
  else
    # Anything else is treated as a static value to extract as-is
    # (documented under #extractor). FIX: removed the unreachable raise
    # that followed this return.
    return lambda { |element| source }
  end
end
|
617
|
+
|
618
|
+
|
619
|
+
# Returns a Proc that will set the extract value in the object.
|
620
|
+
#
|
621
|
+
# The +target+ argument identifies an instance variable. It may
|
622
|
+
# be the name of a variable, or the name of a variable prefixed
|
623
|
+
# with [] to denote an array.
|
624
|
+
#
|
625
|
+
# The Proc is called with two arguments: the object to set the
|
626
|
+
# value in, and the value.
|
627
|
+
# Returns a Proc that will set the extracted value in the object.
#
# The +target+ argument identifies an instance variable. It may be the
# name of a variable, a name suffixed with [] to denote an array, the
# special :skip target, or an Array of names to destructure a
# structured value.
#
# The Proc is called with two arguments: the object to set the
# value in, and the value.
def extract_value_to(target)
  if target.is_a?(Array)
    # Destructure: each entry reads the same-named attribute off the
    # extracted value and assigns it through its own setter.
    setters = target.collect do |entry|
      [entry, extract_value_to(entry)]
    end
    return lambda do |object,value|
      setters.each do |setter|
        setter[1].call(object, value.send(setter[0]))
      end
    end
  end

  if target.to_sym == :skip
    # Special target controlling whether later rules see the element.
    return lambda do |object, value|
      object.send(:skip, value)
    end
  end

  target = target.to_s
  if target[-2..-1] == "[]" or (@arrays && array = @arrays.include?(target.to_sym))
    target = target[0...-2] unless array
    # Create an attribute accessor if not already defined.
    begin
      self.instance_method(target)
    rescue NameError
      attr_accessor target
    end
    reader = "#{target}".to_sym
    writer = "#{target}=".to_sym
    # Append to the array, creating it on first assignment.
    return lambda do |object, value|
      array = object.send(reader)
      object.send(writer, array = []) unless array
      array << value
    end
  else
    # Create an attribute accessor if not already defined.
    begin
      self.instance_method(target)
    rescue NameError
      # FIX: removed the unused `instance` local that was computed here.
      attr_accessor target
    end
    # FIX: this local held the writer name but was called `reader`.
    writer = "#{target}=".to_sym
    return lambda { |object, value| object.send(writer, value) }
  end
end
|
673
|
+
|
674
|
+
|
675
|
+
# Propagates configuration to subclasses: a subclass starts out with
# a copy of its parent's options, processing rules and array targets.
def inherited(child)
  super
  # Duplicate options, rules and array declarations into the subclass.
  child.options.update(options)
  child.rules.concat(rules)
  child.instance_variable_set(:@arrays, @arrays)
end
|
682
|
+
|
683
|
+
end
|
684
|
+
|
685
|
+
|
686
|
+
# Options recognized by the page reader and forwarded to it when the
# scraper retrieves a URL (see #document). Guarded so reloading this
# file does not warn about constant redefinition.
unless const_defined? :READER_OPTIONS
  READER_OPTIONS = [:last_modified, :etag, :redirect_limit, :user_agent, :timeout]
end
|
689
|
+
|
690
|
+
|
691
|
+
# Set to true when the first extractor returns true.
attr_accessor :extracted

# Information about the HTML page scraped. See PageInfo.
attr_accessor :page_info

# The options hash passed to this scraper instance on creation.
# Instance options take precedence over class-level options
# (see #option).
attr_accessor :options
|
701
|
+
|
702
|
+
|
703
|
+
# Create a new scraper instance.
#
# The argument +source+ is a URL, string containing HTML, or HTML::Node.
# The optional argument +options+ are options passed to the scraper.
# See Base#scrape for more details.
#
# For example:
#   # The page we want to scrape
#   url = URI.parse("http://example.com")
#   # Skip the header
#   scraper = MyScraper.new(url, :root_element=>"body")
#   result = scraper.scrape
def initialize(source, options = nil)
  @page_info = PageInfo[]
  @options = options || {}
  if source.is_a?(URI)
    # Retrieval is deferred until #document is first called.
    @document = source
  elsif source.is_a?(String) || source.is_a?(HTML::Node)
    @document = source
    # TODO: document and test case these two.
    @page_info.url = @page_info.original_url = @options[:url]
    @page_info.encoding = @options[:encoding]
  else
    raise ArgumentError, "Can only scrape URI, String or HTML::Node"
  end
end
|
730
|
+
|
731
|
+
|
732
|
+
# Scrapes the document and returns the result.
#
# If the scraper was created with a URL, retrieve the page and parse it.
# If the scraper was created with a string, parse the page.
#
# The result is returned by calling the #result method. The default
# implementation returns +self+ if any extractor returned true,
# +nil+ otherwise.
#
# The method may raise any number of exceptions. HTTPError indicates
# it failed to retrieve the HTML page, and HTMLParseError that it failed
# to parse the page. Other exceptions come from extractors and the
# #result method.
#
# See also Base#scrape.
def scrape()
  # Call prepare with the document, but before doing anything else.
  prepare document
  # Retrieve the document. This may raise HTTPError or HTMLParseError.
  case document
  when Array
    # Reversed because we pop nodes off the end of the stack.
    stack = @document.reverse
  when HTML::Node
    # If a root element is specified, start selecting from there.
    # The stack is empty if we can't find any root element (makes
    # sense). However, the node we're going to process may be
    # a tag, or an HTML::Document.root which is the equivalent of
    # a document fragment.
    root_element = option(:root_element)
    root = root_element ? @document.find(:tag=>root_element) : @document
    stack = root ? (root.tag? ? [root] : root.children.reverse) : []
  else
    return
  end
  # @skip stores all the elements we want to skip (see #skip).
  # rules stores all the rules we want to process with this
  # scraper, based on the class definition.
  @skip = []
  @stop = false
  rules = self.class.rules.clone
  begin
    # Process the document one node at a time. We process elements
    # from the end of the stack, so each time we visit child elements,
    # we add them to the end of the stack in reverse order.
    while node = stack.pop
      break if @stop
      skip_this = false
      # Only match nodes that are elements, ignore text nodes.
      # Also ignore any element that's on the skip list, and if
      # found one, remove it from the list (since we never visit
      # the same element twice). But an element may be added twice
      # to the skip list.
      # Note: equal? is faster than == for nodes.
      next unless node.tag?
      # Single pass: detects a skip-list hit (sets skip_this) and
      # removes the entry at the same time.
      @skip.delete_if { |s| skip_this = true if s.equal?(node) }
      next if skip_this

      # Run through all the rules until we process the element or
      # run out of rules. If skip_this=true then we processed the
      # element and we can break out of the loop. However, we might
      # process (and skip) descendants so also watch the skip list.
      # NOTE: delete_if drops a rule when the block returns a truthy
      # value -- see the expression at the end of the block.
      rules.delete_if do |selector, extractor, rule_name, first_only|
        break if skip_this
        # The result of calling match (selected) is nil, element
        # or array of elements. We turn it into an array to
        # process one element at a time. We process all elements
        # that are not on the skip list (we haven't visited
        # them yet).
        if selected = selector.match(node, first_only)
          selected = [selected] unless selected.is_a?(Array)
          selected = [selected.first] if first_only
          selected.each do |element|
            # Do not process elements we already skipped
            # (see above). However, this time we may visit
            # an element twice, since selected elements may
            # be descendants of the current element on the
            # stack. In rare cases two elements on the stack
            # may pick the same descendants.
            next if @skip.find { |s| s.equal?(element) }
            # Call the extractor method with this element.
            # If it returns true, skip the element and if
            # the current element, don't process any more
            # rules. Again, pay attention to descendants.
            if extractor.bind(self).call(element)
              @extracted = true
            end
            # skip() with no argument pushes +true+ onto @skip;
            # deleting it here tells us the extractor asked to
            # skip the element it just processed.
            if @skip.delete(true)
              if element.equal?(node)
                skip_this = true
              else
                @skip << element
              end
            end
          end
          # This expression is the delete_if block's return value:
          # a first_only rule is removed from the rule list once it
          # matched at least one element. Do not "simplify" it away.
          first_only if !selected.empty?
        end
      end

      # If we did not skip the element, we're going to process its
      # children. Reverse order since we're popping from the stack.
      if !skip_this && children = node.children
        stack.concat children.reverse
      end
    end
  ensure
    # Always clear the skip list, even if an extractor raised.
    @skip = nil
  end
  collect
  return result
end
|
842
|
+
|
843
|
+
|
844
|
+
# Returns the document being processed.
#
# If the scraper was created with a URL, this method will attempt to
# retrieve the page and parse it.
#
# If the scraper was created with a string, this method will attempt
# to parse the page.
#
# Be advised that calling this method may raise an exception
# (HTTPError or HTMLParseError).
#
# The document is parsed only the first time this method is called.
def document
  if @document.is_a?(URI)
    # Fetch over HTTP first; #request replaces @document with the
    # raw page content. May raise HTTPError.
    reader_options = READER_OPTIONS.inject({}) do |hash, key|
      hash.update(key => option(key))
    end
    request(@document, reader_options)
  end
  if @document.is_a?(String)
    # Parse the raw HTML into a node tree. May raise HTMLParseError.
    parsed = Reader.parse_page(@document, @page_info.encoding,
                               option(:parser_options), option(:parser))
    @document = parsed.document
    @page_info.encoding = parsed.encoding
  end
  raise RuntimeError, "No document to process" unless @document.is_a?(HTML::Node)
  @document
end
|
873
|
+
|
874
|
+
|
875
|
+
# Retrieves the page at +url+ using Reader.read_page and, on success,
# records the response in @page_info (final URL after redirects,
# last-modified/etag caching headers, encoding) and stores the raw
# page content in @document.
#
# Does nothing when the reader returns nil/false. May raise HTTPError.
def request(url, options)
  # Fixed: read from the +url+ parameter instead of reaching back into
  # @document (the old code ignored its first argument). Callers always
  # passed @document, so behavior is unchanged for existing call sites.
  if page = Reader.read_page(url, options)
    @page_info.url = page.url
    @page_info.original_url = url
    @page_info.last_modified = page.last_modified
    @page_info.etag = page.etag
    @page_info.encoding = page.encoding
    @document = page.content
  end
end
|
885
|
+
|
886
|
+
|
887
|
+
# :call-seq:
#   skip() => true
#   skip(element) => true
#   skip([element ...]) => true
#
# Skips processing the specified element(s).
#
# If called with a single element, that element will not be processed.
#
# If called with an array of elements, all the elements in the array
# are skipped.
#
# If called with no element, skips processing the current element.
# This has the same effect as returning true.
#
# For convenience this method always returns true. For example:
#   process "h1" do |element|
#     @header = element
#     skip
#   end
def skip(elements = nil)
  # Fixed: `when X: expr` is Ruby 1.8-only syntax and no longer
  # parses; `then` is the modern equivalent. Behavior is unchanged.
  case elements
  when Array then @skip.concat elements
  when HTML::Node then @skip << elements
  when nil then @skip << true
  when true, false then @skip << elements
  end
  # Calling skip(element) as the last statement is
  # redundant by design.
  return true
end
|
918
|
+
|
919
|
+
|
920
|
+
# Stops processing this page. Call this early on if you discover
# there is nothing of interest on the page, or once you have
# extracted all useful information.
def stop
  @stop = true
end
|
926
|
+
|
927
|
+
|
928
|
+
# Called by #scrape after creating the document, but before running
# any processing rules.
#
# This default implementation is a no-op; override it in a subclass
# to do any preparation work on the document.
def prepare(document)
end
|
934
|
+
|
935
|
+
|
936
|
+
# Called by #scrape scraping the document, and before calling #result.
|
937
|
+
# Typically used to run any validation, post-processing steps,
|
938
|
+
# resolving referenced elements, etc.
|
939
|
+
def collect()
|
940
|
+
end
|
941
|
+
|
942
|
+
|
943
|
+
# Returns the result of a successful scrape.
#
# This method is called by #scrape after running all the rules on the
# document. You can also call it directly.
#
# Override this method to return a specific object, perform
# post-scraping processing, validation, etc.
#
# The default implementation returns +self+ if any extractor returned
# true, +nil+ otherwise.
#
# If you override this method, implement your own logic to determine
# if anything was extracted and return +nil+ otherwise. Also, make sure
# calling this method multiple times returns the same result.
def result()
  @extracted ? self : nil
end
|
960
|
+
|
961
|
+
|
962
|
+
# Returns the value of an option.
#
# Looks up +symbol+ in the options passed to this scraper instance on
# creation; falls back to the option set on the scraper class. Class
# options are inherited from the parent class.
def option(symbol)
  if options.has_key?(symbol)
    options[symbol]
  else
    self.class.options[symbol]
  end
end
|
970
|
+
|
971
|
+
|
972
|
+
end
|
973
|
+
|
974
|
+
|
975
|
+
# Defines an anonymous scraper and returns the class.
#
# For example:
#   links = Scraper.define do
#     process "a[href]", :urls=>"@href"
#     result :urls
#   end
#
#   puts links.scrape(html)
def self.define(&block)
  anonymous = Class.new(Scraper::Base)
  anonymous.module_eval(&block)
  anonymous
end
|
989
|
+
|
990
|
+
end
|