scrapi 1.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +22 -0
- data/MIT-LICENSE +20 -0
- data/README +88 -0
- data/Rakefile +67 -0
- data/lib/html/document.rb +64 -0
- data/lib/html/htmlparser.rb +407 -0
- data/lib/html/node.rb +534 -0
- data/lib/html/node_ext.rb +86 -0
- data/lib/html/selector.rb +825 -0
- data/lib/html/tokenizer.rb +105 -0
- data/lib/html/version.rb +11 -0
- data/lib/scraper/base.rb +970 -0
- data/lib/scraper/reader.rb +239 -0
- data/lib/scrapi.rb +8 -0
- data/lib/tidy/libtidy.dll +0 -0
- data/lib/tidy/libtidy.so +0 -0
- data/test/mock_net_http.rb +54 -0
- data/test/node_ext_test.rb +24 -0
- data/test/reader_test.rb +299 -0
- data/test/scraper_test.rb +798 -0
- data/test/selector_test.rb +637 -0
- metadata +81 -0
@@ -0,0 +1,105 @@
|
|
1
|
+
require 'strscan'
|
2
|
+
|
3
|
+
module HTML #:nodoc:

  # A simple HTML tokenizer. It breaks a stream of text into tokens, where
  # each token is a string. Each string represents either "text" or an HTML
  # element.
  #
  # This currently assumes valid XHTML, which means no free < or > characters.
  #
  # Usage:
  #
  #   tokenizer = HTML::Tokenizer.new(text)
  #   while token = tokenizer.next
  #     p token
  #   end
  class Tokenizer #:nodoc:

    # The current (byte) position in the text
    attr_reader :position

    # The current line number
    attr_reader :line

    # Create a new Tokenizer for the given text.
    def initialize(text)
      @scanner = StringScanner.new(text)
      @position = 0
      @line = 0
      @current_line = 1
    end

    # Return the next token in the sequence, or +nil+ if there are no more
    # tokens in the stream.
    def next
      return nil if @scanner.eos?
      @position = @scanner.pos
      @line = @current_line
      if @scanner.check(/<\S/)
        update_current_line(scan_tag)
      else
        update_current_line(scan_text)
      end
    end

    private

    # Treat the text at the current position as a tag, and scan it. Supports
    # comments, CDATA sections, doctype tags, and regular tags, and ignores
    # less-than and greater-than characters within quoted strings.
    def scan_tag
      tag = @scanner.getch
      if @scanner.scan(/!--/) # comment
        tag << @scanner.matched
        tag << (@scanner.scan_until(/--\s*>/) || @scanner.scan_until(/\Z/))
      elsif @scanner.scan(/!\[CDATA\[/)
        tag << @scanner.matched
        # Fall back to the rest of the input when the CDATA section is
        # unterminated, mirroring the comment branch above. (Fixed defect:
        # the original appended scan_until(/\]\]>/) directly, which is nil
        # for unterminated CDATA and raised TypeError.)
        tag << (@scanner.scan_until(/\]\]>/) || @scanner.scan_until(/\Z/))
      elsif @scanner.scan(/!/) # doctype
        tag << @scanner.matched
        tag << consume_quoted_regions
      else
        tag << consume_quoted_regions
      end
      tag
    end

    # Scan all text up to the next < character and return it.
    def scan_text
      "#{@scanner.getch}#{@scanner.scan(/[^<]*/)}"
    end

    # Counts the number of newlines in the text and updates the current line
    # accordingly. Returns +text+ itself (String#scan with a block returns
    # its receiver), so #next can pass a token straight through this method.
    def update_current_line(text)
      text.scan(/\r?\n/) { @current_line += 1 }
    end

    # Skips over quoted strings, so that less-than and greater-than characters
    # within the strings are ignored.
    def consume_quoted_regions
      text = ""
      loop do
        match = @scanner.scan_until(/['"<>]/) or break

        delim = @scanner.matched
        if delim == "<"
          match = match.chop
          @scanner.pos -= 1
        end

        text << match
        break if delim == "<" || delim == ">"

        # consume the quoted region
        while match = @scanner.scan_until(/[\\#{delim}]/)
          text << match
          break if @scanner.matched == delim
          text << @scanner.getch # skip the escaped character
        end
      end
      text
    end
  end

end
|
data/lib/html/version.rb
ADDED
data/lib/scraper/base.rb
ADDED
@@ -0,0 +1,970 @@
|
|
1
|
+
# ScrAPI toolkit for Ruby
|
2
|
+
#
|
3
|
+
# Copyright (c) 2006 Assaf Arkin, under Creative Commons Attribution and/or MIT License
|
4
|
+
# Developed for http://co.mments.com
|
5
|
+
# Code and documentation: http://labnotes.org
|
6
|
+
|
7
|
+
|
8
|
+
require "rubygems"
|
9
|
+
require File.join(File.dirname(__FILE__), "reader")
|
10
|
+
|
11
|
+
|
12
|
+
module Scraper
|
13
|
+
|
14
|
+
class Base
|
15
|
+
|
16
|
+
|
17
|
+
# Information about the HTML page scraped. A structure with the following
|
18
|
+
# attributes:
|
19
|
+
# * <tt>url</tt> -- The URL of the document being scraped. Passed in
|
20
|
+
# the constructor but may have changed if the page was redirected.
|
21
|
+
# * <tt>original_url</tt> -- The original URL of the document being
|
22
|
+
# scraped as passed in the constructor.
|
23
|
+
# * <tt>encoding</tt> -- The encoding of the document.
|
24
|
+
# * <tt>last_modified</tt> -- Value of the Last-Modified header returned
|
25
|
+
# from the server.
|
26
|
+
# * <tt>etag</tt> -- Value of the Etag header returned from the server.
|
27
|
+
PageInfo = Struct.new(:url, :original_url, :encoding, :last_modified, :etag)
|
28
|
+
|
29
|
+
|
30
|
+
class << self
|
31
|
+
|
32
|
+
# :call-seq:
|
33
|
+
# process(symbol?, selector, values?, extractor)
|
34
|
+
# process(symbol?, selector, values?) { |element| ... }
|
35
|
+
#
|
36
|
+
# Defines a processing rule. A processing rule consists of a selector
|
37
|
+
# that matches element, and an extractor that does something interesting
|
38
|
+
# with their value.
|
39
|
+
#
|
40
|
+
# == Symbol
|
41
|
+
#
|
42
|
+
# Rules are processed in the order in which they are defined. Use #rules
|
43
|
+
# if you need to change the order of processing.
|
44
|
+
#
|
45
|
+
# Rules can be named or anonymous. If the first argument is a symbol,
|
46
|
+
# it is used as the rule name. You can use the rule name to position,
|
47
|
+
# remove or replace it.
|
48
|
+
#
|
49
|
+
# == Selector
|
50
|
+
#
|
51
|
+
# The first argument is a selector. It selects elements from the document
|
52
|
+
# that are potential candidates for extraction. Each selected element is
|
53
|
+
# passed to the extractor.
|
54
|
+
#
|
55
|
+
# The +selector+ argument may be a string, an HTML::Selector object or
|
56
|
+
# any object that responds to the +select+ method. Passing an Array
|
57
|
+
# (responds to +select+) will not do anything useful.
|
58
|
+
#
|
59
|
+
# String selectors support value substitution, replacing question marks
|
60
|
+
# (?) in the selector expression with values from the method arguments.
|
61
|
+
# See HTML::Selector for more information.
|
62
|
+
#
|
63
|
+
# == Extractor
|
64
|
+
#
|
65
|
+
# The last argument or block is the extractor. The extractor does
|
66
|
+
# something interested with the selected element, typically assigns
|
67
|
+
# it to an instance variable of the scraper.
|
68
|
+
#
|
69
|
+
# Since the extractor is called on the scraper, it can also use the
|
70
|
+
# scraper to maintain state, e.g. this extractor counts how many
|
71
|
+
# +div+ elements appear in the document:
|
72
|
+
# process "div" { |element| @count += 1 }
|
73
|
+
#
|
74
|
+
# The extractor returns +true+ if the element was processed and
|
75
|
+
# should not be passed to any other extractor (including any child
|
76
|
+
# elements).
|
77
|
+
#
|
78
|
+
# The default implementation of #result returns +self+ only if at
|
79
|
+
# least one extractor returned +true+. However, you can override
|
80
|
+
# #result and use extractors that return +false+.
|
81
|
+
#
|
82
|
+
# A block extractor is called with a single element.
|
83
|
+
#
|
84
|
+
# You can also use the #extractor method to create extractors that
|
85
|
+
# assign elements, attributes and text values to instance variables,
|
86
|
+
# or pass a +Hash+ as the last argument to #process. See #extractor
|
87
|
+
# for more information.
|
88
|
+
#
|
89
|
+
# When using a block, the last statement is the response. Do not use
|
90
|
+
# +return+, use +next+ if you want to return a value before the last
|
91
|
+
# statement. +return+ does not do what you expect it to.
|
92
|
+
#
|
93
|
+
# == Example
|
94
|
+
#
|
95
|
+
# class ScrapePosts < Scraper::Base
|
96
|
+
# # Select the title of a post
|
97
|
+
# selector :select_title, "h2"
|
98
|
+
#
|
99
|
+
# # Select the body of a post
|
100
|
+
# selector :select_body, ".body"
|
101
|
+
#
|
102
|
+
# # All elements with class name post.
|
103
|
+
# process ".post" do |element|
|
104
|
+
# title = select_title(element)
|
105
|
+
# body = select_body(element)
|
106
|
+
# @posts << Post.new(title, body)
|
107
|
+
# true
|
108
|
+
# end
|
109
|
+
#
|
110
|
+
# attr_reader :posts
|
111
|
+
# end
|
112
|
+
#
|
113
|
+
# posts = ScrapePosts.scrape(html).posts
|
114
|
+
#
|
115
|
+
# To process only a single element:
|
116
|
+
#
|
117
|
+
# class ScrapeTitle < Scraper::Base
|
118
|
+
# process "html>head>title", :title=>text
|
119
|
+
# result :title
|
120
|
+
# end
|
121
|
+
#
|
122
|
+
# puts ScrapeTitle.scrape(html)
|
123
|
+
# :call-seq:
#   process(symbol?, selector, values?, extractor)
#   process(symbol?, selector, values?) { |element| ... }
#
# Defines a processing rule applied to every element matched by the
# selector. The optional leading symbol names the rule; the trailing
# Hash/Proc/block is the extractor. See the class documentation for
# the full description of selectors and extractors.
def process(*selector, &block)
  create_process(false, *selector, &block)
end


# Similar to #process, but extracts from the first selected element
# only. Faster when the document contains a single applicable element,
# or when only the first match is of interest.
def process_first(*selector, &block)
  create_process(true, *selector, &block)
end
|
135
|
+
|
136
|
+
|
137
|
+
# :call-seq:
|
138
|
+
# selector(symbol, selector, values?)
|
139
|
+
# selector(symbol, selector, values?) { |elements| ... }
|
140
|
+
#
|
141
|
+
# Create a selector method. You can call a selector method directly
|
142
|
+
# to select elements.
|
143
|
+
#
|
144
|
+
# For example, define a selector:
|
145
|
+
# selector :five_divs, "div" { |elems| elems[0..4] }
|
146
|
+
# And call it to retrieve the first five +div+ elements:
|
147
|
+
# divs = five_divs(element)
|
148
|
+
#
|
149
|
+
# Call a selector method with an element and it returns an array of
|
150
|
+
# elements that match the selector, beginning with the element argument
|
151
|
+
# itself. It returns an empty array if nothing matches.
|
152
|
+
#
|
153
|
+
# If the selector is defined with a block, all selected elements are
|
154
|
+
# passed to the block and the result of the block is returned.
|
155
|
+
#
|
156
|
+
# For convenience, a <tt>first_</tt> method is also created that
|
157
|
+
# returns (and yields) only the first selected element. For example:
|
158
|
+
# selector :post, "#post"
|
159
|
+
# @post = first_post
|
160
|
+
#
|
161
|
+
# Since the selector is defined with a block, both methods call that
|
162
|
+
# block with an array of elements.
|
163
|
+
#
|
164
|
+
# The +selector+ argument may be a string, an HTML::Selector object or
|
165
|
+
# any object that responds to the +select+ method. Passing an Array
|
166
|
+
# (responds to +select+) will not do anything useful.
|
167
|
+
#
|
168
|
+
# String selectors support value substitution, replacing question marks
|
169
|
+
# (?) in the selector expression with values from the method arguments.
|
170
|
+
# See HTML::Selector for more information.
|
171
|
+
#
|
172
|
+
# When using a block, the last statement is the response. Do not use
|
173
|
+
# +return+, use +next+ if you want to return a value before the last
|
174
|
+
# statement. +return+ does not do what you expect it to.
|
175
|
+
# :call-seq:
#   selector(symbol, selector, values?)
#   selector(symbol, selector, values?) { |elements| ... }
#
# Creates a selector method named +symbol+. Calling it with an element
# returns an array of elements matching the selector (empty array when
# nothing matches). When defined with a block, the selected elements are
# passed to the block and the block's result is returned (nil when the
# selection is empty). A companion first_<symbol> method is also created
# that selects (and yields to the block) only the first matching element.
#
# The +selector+ argument may be a string, an HTML::Selector object or
# any object that responds to the +select+ method. String selectors
# support value substitution, replacing question marks (?) with values
# from the method arguments -- see HTML::Selector.
def selector(symbol, *selector, &block)
  raise ArgumentError, "Missing selector: the first argument tells us what to select" if selector.empty?
  if selector[0].is_a?(String)
    selector = HTML::Selector.new(*selector)
  else
    # Fixed defect: the original checked selector.respond_to?(:select) on
    # the argument *array* -- an Array always responds to select, so the
    # validation could never fail. Check the selector object itself.
    raise ArgumentError, "Selector must respond to select() method" unless selector[0].respond_to?(:select)
    selector = selector[0]
  end
  if block
    define_method symbol do |element|
      selected = selector.select(element)
      return block.call(selected) unless selected.empty?
    end
    define_method "first_#{symbol}" do |element|
      selected = selector.select_first(element)
      return block.call([selected]) if selected
    end
  else
    define_method symbol do |element|
      return selector.select(element)
    end
    define_method "first_#{symbol}" do |element|
      return selector.select_first(element)
    end
  end
end
|
201
|
+
|
202
|
+
|
203
|
+
# Creates an extractor that will extract values from the selected
|
204
|
+
# element and place them in instance variables of the scraper.
|
205
|
+
# You can pass the result to #process.
|
206
|
+
#
|
207
|
+
# == Example
|
208
|
+
#
|
209
|
+
# This example processes a document looking for an element with the
|
210
|
+
# class name +article+. It extracts the attribute +id+ and stores it
|
211
|
+
# in the instance variable +@id+. It extracts the article node itself
|
212
|
+
# and puts it in the instance variable +@article+.
|
213
|
+
#
|
214
|
+
# class ArticleScraper < Scraper::Base
|
215
|
+
# process ".article", extractor(:id=>"@id", :article=>:element)
|
216
|
+
# attr_reader :id, :article
|
217
|
+
# end
|
218
|
+
# result = ArticleScraper.scrape(html)
|
219
|
+
# puts result.id
|
220
|
+
# puts result.article
|
221
|
+
#
|
222
|
+
# == Sources
|
223
|
+
#
|
224
|
+
# Extractors operate on the selected element, and can extract the
|
225
|
+
# following values:
|
226
|
+
# * <tt>"elem_name"</tt> -- Extracts the element itself if it
|
227
|
+
# matches the element name (e.g. "h2" will extract only level 2
|
228
|
+
# header elements).
|
229
|
+
# * <tt>"attr_name"</tt> -- Extracts the attribute value from the
|
230
|
+
# element if specified (e.g. "@id" will extract the id attribute).
|
231
|
+
# * <tt>"elem_name@attr_name"</tt> -- Extracts the attribute value
|
232
|
+
# from the element if specified, but only if the element has the
|
233
|
+
# specified name (e.g. "h2@id").
|
234
|
+
# * <tt>:element</tt> -- Extracts the element itself.
|
235
|
+
# * <tt>:text</tt> -- Extracts the text value of the node.
|
236
|
+
# * <tt>Scraper</tt> -- Using this class creates a scraper to
|
237
|
+
# process the current element and extract the result. This can
|
238
|
+
# be used for handling complex structure.
|
239
|
+
#
|
240
|
+
# If you use an array of sources, the first source that matches
|
241
|
+
# anything is used. For example, <tt>["attr@title", :text]</tt>
|
242
|
+
# extracts the value of the +title+ attribute if the element is
|
243
|
+
# +abbr+, otherwise the text value of the element.
|
244
|
+
#
|
245
|
+
# If you use a hash, you can extract multiple values at the same
|
246
|
+
# time. For example, <tt>{:id=>"@id", :class=>"@class"}</tt>
|
247
|
+
# extracts the +id+ and +class+ attribute values.
|
248
|
+
#
|
249
|
+
# :element and :text are special cases of symbols. You can pass any
|
250
|
+
# symbol that matches a class method and that class method will
|
251
|
+
# be called to extract a value from the selected element.
|
252
|
+
# You can also pass a Proc or Method directly.
|
253
|
+
#
|
254
|
+
# And it's always possible to pass a static value, quite useful for
|
255
|
+
# processing an element with more than one rule (<tt>:skip=>false</tt>).
|
256
|
+
#
|
257
|
+
# == Targets
|
258
|
+
#
|
259
|
+
# Extractors assign the extracted value to an instance variable
|
260
|
+
# of the scraper. The instance variable contains the last value
|
261
|
+
# extracted.
|
262
|
+
#
|
263
|
+
# Also creates an accessor for that instance variable. An accessor
|
264
|
+
# is created if no such method exists. For example,
|
265
|
+
# <tt>:title=>:text</tt> creates an accessor for +title+. However,
|
266
|
+
# <tt>:id=>"@id"</tt> does not create an accessor since each
|
267
|
+
# object already has a method called +id+.
|
268
|
+
#
|
269
|
+
# If you want to extract multiple values into the same variables,
|
270
|
+
# use #array to declare that accessor as an array.
|
271
|
+
#
|
272
|
+
# Alternatively, you can append <tt>[]</tt> to the variable name.
|
273
|
+
# For example:
|
274
|
+
# process "*", "ids[]"=>"@id"
|
275
|
+
# result :ids
|
276
|
+
#
|
277
|
+
# The special target <tt>:skip</tt> allows you to control whether
|
278
|
+
# other rules can apply to the same element. By default a processing
|
279
|
+
# rule without a block (or a block that returns true) will skip
|
280
|
+
# that element so no other processing rule sees it.
|
281
|
+
#
|
282
|
+
# You can change this with <tt>:skip=>false</tt>.
|
283
|
+
# Creates an extractor Proc that pulls values from the selected element
# and stores them in instance variables of the scraper. Suitable for
# passing to #process. The +map+ hash pairs each target (instance
# variable / :skip) with a source (element name, "@attr", symbol, Proc,
# nested scraper class, array of alternatives, or hash of several) --
# see the class documentation for the full vocabulary. The returned
# Proc always evaluates to +true+ so the element is marked extracted.
def extractor(map)
  extracts = []
  map.each_pair do |target, source|
    from = extract_value_from(source)
    to = extract_value_to(target)
    # Bind through a temporary method so each extract runs with the
    # scraper instance as self.
    define_method :__extractor do |element|
      value = from.call(element)
      to.call(self, value) unless value.nil?
    end
    extracts << instance_method(:__extractor)
    remove_method :__extractor
  end
  lambda do |element|
    extracts.each { |extract| extract.bind(self).call(element) }
    true
  end
end
|
302
|
+
|
303
|
+
|
304
|
+
# Scrapes the document and returns the result.
|
305
|
+
#
|
306
|
+
# The first argument provides the input document. It can be one of:
|
307
|
+
# * <tt>URI</tt> -- Retrieve an HTML page from this URL and
|
308
|
+
# scrape it.
|
309
|
+
# * <tt>String</tt> -- The HTML page as a string.
|
310
|
+
# * <tt>HTML::Node</tt> -- An HTML node, can be a document
|
311
|
+
# or element.
|
312
|
+
#
|
313
|
+
# You can specify options for the scraper class, or override
|
314
|
+
# these by passing options in the second argument. Some options
|
315
|
+
# only make sense in the constructor.
|
316
|
+
#
|
317
|
+
# The following options are supported for reading HTML pages:
|
318
|
+
# * <tt>:last_modified</tt> -- Last-Modified header used for
|
319
|
+
# caching.
|
320
|
+
# * <tt>:etag</tt> -- ETag header used for caching.
|
321
|
+
# * <tt>:redirect_limit</tt> -- Limits number of redirects
|
322
|
+
# to follow.
|
323
|
+
# * <tt>:user_agent</tt> -- Value for User-Agent header.
|
324
|
+
# * <tt>:timeout</tt> -- HTTP open connection/read timeouts
|
325
|
+
# (in second).
|
326
|
+
#
|
327
|
+
# The following options are supported for parsing the HTML:
|
328
|
+
# * <tt>:root_element</tt> -- The root element to scrape, see
|
329
|
+
# also #root_elements.
|
330
|
+
# * <tt>:parser_options</tt> -- Specifies which parser to use.
|
331
|
+
# (Typically, you set this for the class).
|
332
|
+
# * <tt>:parser_options</tt> -- Options to pass to the parser.
|
333
|
+
#
|
334
|
+
# The result is returned by calling the #result method.
|
335
|
+
# The default implementation returns +self+ if any extractor
|
336
|
+
# returned true, +nil+ otherwise.
|
337
|
+
#
|
338
|
+
# For example:
|
339
|
+
# result = MyScraper.scrape(url, :root_element=>"body")
|
340
|
+
#
|
341
|
+
# The method may raise any number of exceptions. HTTPError
|
342
|
+
# indicates it failed to retrieve the HTML page, and HTMLParseError
|
343
|
+
# that it failed to parse the page. Other exceptions come from
|
344
|
+
# extractors and the #result method.
|
345
|
+
# Scrapes the document and returns the result.
#
# The +source+ argument is a URI to fetch, an HTML string, or an
# HTML::Node. The optional +options+ hash overrides class-level options
# (reader options such as :user_agent/:timeout, and parsing options such
# as :root_element/:parser). Equivalent to instantiating the scraper and
# calling #scrape on it; may raise HTTPError or HTMLParseError, plus
# anything raised by extractors or #result.
def scrape(source, options = nil)
  self.new(source, options).scrape
end
|
349
|
+
|
350
|
+
|
351
|
+
# Returns the text of the element.
|
352
|
+
#
|
353
|
+
# You can use this method from an extractor, e.g.:
|
354
|
+
# process "title", :title=>:text
|
355
|
+
# Returns the concatenated text content of the element, walking its
# children depth-first in document order.
#
# You can use this method from an extractor, e.g.:
#   process "title", :title=>:text
def text(element)
  content = ""
  pending = element.children.reverse
  until pending.empty?
    node = pending.pop
    if node.tag?
      # Push children in reverse so they pop back out in document order.
      pending.concat node.children.reverse
    else
      content << node.content
    end
  end
  content
end
|
367
|
+
|
368
|
+
|
369
|
+
# Returns the element itself.
|
370
|
+
#
|
371
|
+
# You can use this method from an extractor, e.g.:
|
372
|
+
# process "h1", :header=>:element
|
373
|
+
# Returns the element itself (identity extractor).
#
# You can use this method from an extractor, e.g.:
#   process "h1", :header=>:element
def element(element)
  element
end
|
376
|
+
|
377
|
+
|
378
|
+
# Specifies which parser to use. The default is +:tidy+.
|
379
|
+
# Specifies which parser to use. The default is +:tidy+.
def parser(name = :tidy)
  self.options[:parser] = name
end


# Options to pass to the parser, e.g. Tidy clean-up settings.
#
# Sets the option on the class; subclasses inherit it. A scraper object
# can override it via the +:parser_options+ option.
def parser_options(options)
  self.options[:parser_options] = options
end


# The root element to scrape.
#
# The root element of an HTML document is +html+; set this to +head+ or
# +body+ to scrape only that part. Sets the option on the class;
# subclasses inherit it. A scraper object can override it via the
# +:root_element+ option. Pass +nil+ to clear.
def root_element(name)
  self.options[:root_element] = name ? name.to_s : nil
end


# Returns the (lazily created) options hash for this class.
def options
  @options ||= {}
end


# Returns the (lazily created) array of rules defined for this class.
# Reorder this array to change the order of rule processing.
def rules
  @rules ||= []
end
|
422
|
+
|
423
|
+
|
424
|
+
# Modifies this scraper to return a single value or a structure.
|
425
|
+
# Use in combination with accessors.
|
426
|
+
#
|
427
|
+
# When called with one symbol, scraping returns the result of
|
428
|
+
# calling that method (typically an accessor). When called with
|
429
|
+
# two or more symbols, scraping returns a structure of values,
|
430
|
+
# one for each symbol.
|
431
|
+
#
|
432
|
+
# For example:
|
433
|
+
# class ScrapeTitle < Scraper::Base
|
434
|
+
# process_first "html>head>title", :title=>:text
|
435
|
+
# result :title
|
436
|
+
# end
|
437
|
+
#
|
438
|
+
# puts "Title: " + ScrapeTitle.scrape(html)
|
439
|
+
#
|
440
|
+
# class ScrapeDts < Scraper::Base
|
441
|
+
# process ".dtstart", :dtstart=>["abbr@title", :text]
|
442
|
+
# process ".dtend", :dtend=>["abbr@title", :text]
|
443
|
+
# result :dtstart, :dtend
|
444
|
+
# end
|
445
|
+
#
|
446
|
+
# dts = ScrapeDts.scrape(html)
|
447
|
+
# puts "Starts: #{dts.dtstart}"
|
448
|
+
# puts "Ends: #{dts.dtend}"
|
449
|
+
# Modifies this scraper to return a single value or a structure;
# use in combination with accessors.
#
# With one symbol, #scrape returns the result of calling that method
# (typically an accessor). With two or more symbols, it returns a Struct
# holding one value per symbol, e.g.:
#
#   class ScrapeDts < Scraper::Base
#     process ".dtstart", :dtstart=>["abbr@title", :text]
#     process ".dtend", :dtend=>["abbr@title", :text]
#     result :dtstart, :dtend
#   end
def result(*symbols)
  raise ArgumentError, "Use one symbol to return the value of this accessor, multiple symbols to returns a structure" if symbols.empty?
  symbols = symbols.map(&:to_sym)
  if symbols.size == 1
    only = symbols.first
    define_method :result do
      self.send(only)
    end
  else
    struct = Struct.new(*symbols)
    define_method :result do
      struct.new(*symbols.map { |sym| self.send(sym) })
    end
  end
end
|
463
|
+
|
464
|
+
|
465
|
+
# Declares which accessors are arrays. You can declare the
|
466
|
+
# accessor here, or use "symbol[]" as the target.
|
467
|
+
#
|
468
|
+
# For example:
|
469
|
+
# array :urls
|
470
|
+
# process "a[href]", :urls=>"@href"
|
471
|
+
# Is equivalent to:
|
472
|
+
# process "a[href]", "urls[]"=>"@href"
|
473
|
+
# Declares which accessors are arrays. Declaring here is equivalent to
# using a "symbol[]" target, i.e.
#   array :urls
#   process "a[href]", :urls=>"@href"
# is the same as:
#   process "a[href]", "urls[]"=>"@href"
def array(*symbols)
  @arrays ||= []
  symbols.each do |name|
    @arrays << name.to_sym
  end
end
|
477
|
+
|
478
|
+
|
479
|
+
private
|
480
|
+
|
481
|
+
|
482
|
+
# Called by #process and #process_first, see there for
|
483
|
+
# documentation. First argument indicates whether to
|
484
|
+
# process only the first matching element (+true+) or
|
485
|
+
# all matching elements (+false+).
|
486
|
+
# Called by #process and #process_first, see there for documentation.
# +first+ indicates whether to process only the first matching element
# (+true+) or all matching elements (+false+). Remaining arguments are
# an optional rule-name Symbol, the selector, and an extractor given as
# a trailing Hash/Proc and/or block. A rule with the name of an existing
# rule replaces it in place; otherwise the rule is appended.
def create_process(first, *selector, &block)
  # First argument may be the rule name.
  name = selector.shift if selector.first.is_a?(Symbol)
  # Extractor is either a block, last argument or both.
  if selector.last.is_a?(Proc)
    extractor = selector.pop
  elsif selector.last.is_a?(Hash)
    extractor = extractor(selector.pop)
  end
  if block && extractor
    # Ugly, but no other way to chain two calls bound to the
    # scraper instance.
    define_method :__extractor, extractor
    extractor1 = instance_method(:__extractor)
    define_method :__extractor, block
    extractor2 = instance_method(:__extractor)
    remove_method :__extractor
    extractor = lambda do |element|
      extractor1.bind(self).call(element)
      extractor2.bind(self).call(element)
    end
  elsif block
    extractor = block
  end
  raise ArgumentError, "Missing extractor: the last argument tells us what to extract" unless extractor
  # And if we think the extractor is the last argument,
  # it's certainly not the selector.
  raise ArgumentError, "Missing selector: the first argument tells us what to select" if selector.empty?
  if selector[0].is_a?(String)
    selector = HTML::Selector.new(*selector)
  else
    # Fixed defect: the original checked respond_to?(:select) on the
    # argument *array* (always true); validate the selector object itself.
    raise ArgumentError, "Selector must respond to select() method" unless selector[0].respond_to?(:select)
    selector = selector[0]
  end
  # Create a method for fast evaluation.
  define_method :__extractor, extractor
  method = instance_method(:__extractor)
  remove_method :__extractor
  # Replace an existing rule with the same name in place, otherwise
  # append. (The two identical append branches of the original are
  # collapsed into one.)
  if name && (found = rules.find { |rule| rule[2] == name })
    found[0] = selector
    found[1] = method
  else
    rules << [selector, method, name, first]
  end
end
|
537
|
+
|
538
|
+
|
539
|
+
# Returns a Proc that will extract a value from an element.
|
540
|
+
#
|
541
|
+
# The +source+ argument specifies which value to extract.
|
542
|
+
# See #extractor for more details.
|
543
|
+
#
|
544
|
+
# The Proc is called with an element and returns a value
|
545
|
+
# or +nil+.
|
546
|
+
# Returns a Proc that will extract a value from an element.
#
# The +source+ argument specifies which value to extract; see #extractor
# for the vocabulary (element name, "@attr", "elem@attr", Symbol naming
# a class method, Proc/Method, nested scraper Class, Array of
# alternatives, Hash of several, or any other object as a static value).
#
# The Proc is called with an element and returns a value or +nil+.
def extract_value_from(source)
  case source
  when Array
    # For an array, each item is itself a source argument.
    # We stop at the first value we're able to extract.
    alternatives = source.collect { |item| extract_value_from(item) }
    return lambda do |element|
      result = nil
      alternatives.each { |proc| break if result = proc.call(element) }
      result
    end
  when Hash
    # For a hash, each pair is a symbol and source argument.
    # We extract all the values and set them in the hash.
    extractors = source.inject({}) { |hash, (key, value)| hash[key] = extract_value_from(value); hash }
    return lambda do |element|
      result = {}
      extractors.each_pair do |key, proc|
        if value = proc.call(element)
          result[key] = value
        end
      end
      result unless result.empty?
    end
  when Class
    # A class is a scraper we run on the extracted element.
    # It must extend Scraper::Base.
    klass = source
    while klass = klass.superclass
      break if klass == Scraper::Base
    end
    raise ArgumentError, "Class must be a scraper that extends Scraper::Base" unless klass
    return lambda { |element| source.new(element).scrape }
  when Symbol
    # A symbol is a class method we call with the element; it returns
    # the extracted value.
    begin
      method = method(source)
    rescue NameError
      # Narrowed from the original bare rescue modifier, which would
      # also have swallowed unrelated StandardErrors.
      raise ArgumentError, "No method #{source} in #{self.class}"
    end
    return lambda { |element| method.call(element) }
  when Proc, Method
    # Self evident.
    raise ArgumentError, "Proc or Method must take one argument (an element)" if source.arity == 0
    return source
  when /^[\w\-:]+$/
    # An element name. Return the element if the name matches.
    return lambda { |element| element if element.name == source }
  when /^@[\w\-:]+$/
    # An attribute name. Return its value if the attribute is specified.
    attr_name = source[1..-1]
    return lambda { |element| element.attributes[attr_name] }
  when /^[\w\-:]+@[\w\-:]+$/
    # An element with attribute name. Return the attribute value if
    # the attribute is specified, and the element name matches.
    tag_name, attr_name = source.match(/^([\w\-:]+)@([\w\-:]+)$/)[1..2]
    return lambda do |element|
      element.attributes[attr_name] if element.name == tag_name
    end
  else
    # Any other object is a static value, useful e.g. for :skip=>false.
    # (Fixed defect: the original had an unreachable raise after this
    # return; the dead code is removed.)
    return lambda { |element| source }
  end
end
|
609
|
+
|
610
|
+
|
611
|
+
# Returns a Proc that will set the extract value in the object.
|
612
|
+
#
|
613
|
+
# The +target+ argument identifies an instance variable. It may
|
614
|
+
# be the name of a variable, or the name of a variable prefixed
|
615
|
+
# with [] to denote an array.
|
616
|
+
#
|
617
|
+
# The Proc is called with two arguments: the object to set the
|
618
|
+
# value in, and the value.
|
619
|
+
def extract_value_to(target)
|
620
|
+
if target.is_a?(Array)
|
621
|
+
setters = target.collect do |target|
|
622
|
+
[target,extract_value_to(target)]
|
623
|
+
end
|
624
|
+
return lambda do |object,value|
|
625
|
+
setters.each do |setter|
|
626
|
+
setter[1].call(object, value.send(setter[0]))
|
627
|
+
end
|
628
|
+
end
|
629
|
+
end
|
630
|
+
|
631
|
+
if target.to_sym == :skip
|
632
|
+
return lambda do |object, value|
|
633
|
+
object.send(:skip, value)
|
634
|
+
end
|
635
|
+
end
|
636
|
+
|
637
|
+
target = target.to_s
|
638
|
+
if target[-2..-1] == "[]" or (@arrays && array = @arrays.include?(target.to_sym))
|
639
|
+
target = target[0...-2] unless array
|
640
|
+
# Create an attribute accessor is not already defined.
|
641
|
+
begin
|
642
|
+
self.instance_method(target)
|
643
|
+
rescue NameError
|
644
|
+
attr_accessor target
|
645
|
+
end
|
646
|
+
reader = "#{target}".to_sym
|
647
|
+
writer = "#{target}=".to_sym
|
648
|
+
return lambda do |object, value|
|
649
|
+
array = object.send(reader)
|
650
|
+
object.send(writer, array = []) unless array
|
651
|
+
array << value
|
652
|
+
end
|
653
|
+
else
|
654
|
+
# Create an attribute accessor is not already defined.
|
655
|
+
begin
|
656
|
+
self.instance_method(target)
|
657
|
+
rescue NameError
|
658
|
+
attr_accessor target
|
659
|
+
end
|
660
|
+
reader = "#{target}=".to_sym
|
661
|
+
return lambda { |object, value| object.send(reader, value) }
|
662
|
+
end
|
663
|
+
end
|
664
|
+
|
665
|
+
|
666
|
+
# Inheritance hook. Each subclass starts out with a copy of its
# parent's configuration: the options hash, the list of processing
# rules, and the @arrays list of array-valued targets.
def inherited(subclass)
  super
  subclass.instance_variable_set(:@arrays, instance_variable_get(:@arrays))
  subclass.rules.concat(rules)
  subclass.options.update(options)
end
|
673
|
+
|
674
|
+
end
|
675
|
+
|
676
|
+
|
677
|
+
# Option keys forwarded to Reader when fetching a page over HTTP
# (see #document). Guarded so re-loading this file does not emit an
# already-initialized-constant warning.
unless const_defined? :READER_OPTIONS
  READER_OPTIONS = [:last_modified, :etag, :redirect_limit, :user_agent, :timeout]
end


# Set to true when the first extractor returns true.
attr_accessor :extracted


# Information about the HTML page scraped. See PageInfo.
attr_accessor :page_info


# Options passed to the scraper on creation (also writable).
attr_accessor :options
|
692
|
+
|
693
|
+
|
694
|
+
# Creates a new scraper instance.
#
# +source+ is one of:
# * URI        -- the page is fetched when scraping starts
# * String     -- HTML markup, parsed when scraping starts
# * HTML::Node -- an already-parsed document/element
# Anything else raises ArgumentError.
#
# +options+ is an optional hash of scraping options (see Base#scrape).
# For string/node sources, :url and :encoding seed the page info.
#
# For example:
#   # The page we want to scrape
#   url = URI.parse("http://example.com")
#   # Skip the header
#   scraper = MyScraper.new(url, :root_element=>"body")
#   result = scraper.scrape
def initialize(source, options = nil)
  @options = options || {}
  @page_info = PageInfo[]
  unless source.is_a?(URI) || source.is_a?(String) || source.is_a?(HTML::Node)
    raise ArgumentError, "Can only scrape URI, String or HTML::Node"
  end
  @document = source
  unless source.is_a?(URI)
    # TODO: document and test case these two.
    @page_info.url = @page_info.original_url = @options[:url]
    @page_info.encoding = @options[:encoding]
  end
end
|
721
|
+
|
722
|
+
|
723
|
+
# Scrapes the document and returns the result.
|
724
|
+
#
|
725
|
+
# If the scraper was created with a URL, retrieve the page and parse it.
|
726
|
+
# If the scraper was created with a string, parse the page.
|
727
|
+
#
|
728
|
+
# The result is returned by calling the #result method. The default
|
729
|
+
# implementation returns +self+ if any extractor returned true,
|
730
|
+
# +nil+ otherwise.
|
731
|
+
#
|
732
|
+
# The method may raise any number of exceptions. HTTPError indicates
|
733
|
+
# it failed to retrieve the HTML page, and HTMLParseError that it failed
|
734
|
+
# to parse the page. Other exceptions come from extractors and the
|
735
|
+
# #result method.
|
736
|
+
#
|
737
|
+
# See also Base#scrape.
|
738
|
+
def scrape()
  # Call prepare with the document, but before doing anything else.
  prepare document
  # Retrieve the document. This may raise HTTPError or HTMLParseError.
  # (Ruby 1.8 "when x:" syntax throughout; 1.9+ spells this "when x then".)
  case document
  when Array: stack = @document.reverse # see below
  when HTML::Node:
    # If a root element is specified, start selecting from there.
    # The stack is empty if we can't find any root element (makes
    # sense). However, the node we're going to process may be
    # a tag, or an HTML::Document.root which is the equivalent of
    # a document fragment.
    root_element = option(:root_element)
    root = root_element ? @document.find(:tag=>root_element) : @document
    stack = root ? (root.tag? ? [root] : root.children.reverse) : []
  else return
  end
  # @skip stores all the elements we want to skip (see #skip).
  # rules stores all the rules we want to process with this
  # scraper, based on the class definition.
  @skip = []
  @stop = false
  rules = self.class.rules.clone
  begin
    # Process the document one node at a time. We process elements
    # from the end of the stack, so each time we visit child elements,
    # we add them to the end of the stack in reverse order.
    while node = stack.pop
      break if @stop
      skip_this = false
      # Only match nodes that are elements, ignore text nodes.
      # Also ignore any element that's on the skip list, and if
      # found one, remove it from the list (since we never visit
      # the same element twice). But an element may be added twice
      # to the skip list.
      # Note: equal? is faster than == for nodes.
      next unless node.tag?
      @skip.delete_if { |s| skip_this = true if s.equal?(node) }
      next if skip_this

      # Run through all the rules until we process the element or
      # run out of rules. If skip_this=true then we processed the
      # element and we can break out of the loop. However, we might
      # process (and skip) descendants so also watch the skip list.
      rules.delete_if do |selector, extractor, rule_name, first_only|
        break if skip_this
        # The result of calling match (selected) is nil, element
        # or array of elements. We turn it into an array to
        # process one element at a time. We process all elements
        # that are not on the skip list (we haven't visited
        # them yet).
        if selected = selector.match(node, first_only)
          selected = [selected] unless selected.is_a?(Array)
          selected = [selected.first] if first_only
          selected.each do |element|
            # Do not process elements we already skipped
            # (see above). However, this time we may visit
            # an element twice, since selected elements may
            # be descendants of the current element on the
            # stack. In rare cases two elements on the stack
            # may pick the same descendants.
            next if @skip.find { |s| s.equal?(element) }
            # Call the extractor method with this element.
            # If it returns true, skip the element and if
            # the current element, don't process any more
            # rules. Again, pay attention to descendants.
            skip = extractor.bind(self).call(element)
            # #skip may also push literal true/false flags onto @skip;
            # delete(true)/delete(false) consume them here: true forces
            # a skip, false vetoes one.
            if (skip || @skip.delete(true)) && @skip.delete(false).nil?
              @extracted = true
              if element.equal?(node)
                skip_this = true
              else
                @skip << element
              end
            end
          end
          # This expression is the delete_if block's return value:
          # truthy (first_only) drops the rule once it has matched.
          first_only if !selected.empty?
        end
      end

      # If we did not skip the element, we're going to process its
      # children. Reverse order since we're popping from the stack.
      if !skip_this && children = node.children
        stack.concat children.reverse
      end
    end
  ensure
    @skip = nil
  end
  return result
end
|
829
|
+
|
830
|
+
|
831
|
+
# Returns the document being processed.
#
# A scraper created with a URL fetches the page on first call (may
# raise HTTPError); one created with a string parses the markup on
# first call (may raise HTMLParseError). The parsed HTML::Node is
# cached in @document, so the expensive work happens at most once.
#
# Raises RuntimeError if no usable document results.
def document
  if @document.is_a?(URI)
    # Fetch the page; reader options come from the scraper options.
    reader_opts = {}
    READER_OPTIONS.each { |key| reader_opts[key] = option(key) }
    request(@document, reader_opts)
  end
  if @document.is_a?(String)
    # Parse the markup. May raise HTMLParseError.
    parsed = Reader.parse_page(@document, @page_info.encoding,
      option(:parser_options), option(:parser))
    @page_info.encoding = parsed.encoding
    @document = parsed.document
  end
  raise RuntimeError, "No document to process" unless @document.is_a?(HTML::Node)
  @document
end
|
860
|
+
|
861
|
+
|
862
|
+
# Retrieves +url+ using Reader.read_page with the given reader
# +options+ (e.g. :last_modified, :etag). On success, records the
# page's metadata in @page_info and stores the raw content in
# @document for subsequent parsing. Does nothing if no page was
# returned (e.g. not modified).
#
# Fix: the +url+ argument was previously ignored — the body read
# @document instead. The only caller (#document) passes @document,
# so existing behavior is unchanged, but the parameter is now
# honored for any other caller.
def request(url, options)
  if page = Reader.read_page(url, options)
    @page_info.url = page.url
    @page_info.original_url = url
    @page_info.last_modified = page.last_modified
    @page_info.etag = page.etag
    @page_info.encoding = page.encoding
    @document = page.content
  end
end
|
872
|
+
|
873
|
+
|
874
|
+
# :call-seq:
#   skip() => true
#   skip(element) => true
#   skip([element ...]) => true
#
# Skips processing the specified element(s).
#
# With a single element, that element will not be processed; with an
# array, every element in it is skipped. With no argument, the
# current element is skipped (same effect as returning true from the
# extractor). Passing true/false records a flag that #scrape consumes
# to force or veto skipping.
#
# Always returns true for convenience:
#   process "h1" do |element|
#     @header = element
#     skip
#   end
def skip(elements = nil)
  if elements.is_a?(Array)
    @skip.concat elements
  elsif elements.nil?
    @skip << self.element
  elsif elements.is_a?(HTML::Node) || elements == true || elements == false
    @skip << elements
  end
  # Calling skip(element) as the last statement is redundant by design.
  true
end
|
905
|
+
|
906
|
+
|
907
|
+
# Stops processing this page. Call this early on when you discover
# there is no interesting information on the page, or once all the
# useful information has been extracted; #scrape checks the flag at
# the top of its loop and abandons the remaining nodes.
def stop()
  @stop = true
end
|
913
|
+
|
914
|
+
|
915
|
+
# Called by #scrape after creating the document, but before running
|
916
|
+
# any processing rules.
|
917
|
+
#
|
918
|
+
# You can override this method to do any preparation work.
|
919
|
+
def prepare(document)
  # Intentionally a no-op: subclasses override this hook to inspect or
  # adjust state before any processing rules run.
end
|
921
|
+
|
922
|
+
|
923
|
+
# Returns the result of a successful scrape.
#
# #scrape calls this after running all the rules on the document;
# you can also call it directly.
#
# Override this method to return a specific object, or to perform
# post-scraping processing, validation, etc.
#
# The default implementation returns +self+ when at least one
# extractor reported success, and +nil+ otherwise.
#
# When overriding, keep two properties: return +nil+ when nothing
# was extracted, and make repeated calls return the same result.
def result
  @extracted ? self : nil
end
|
940
|
+
|
941
|
+
|
942
|
+
# Returns the value of an option.
#
# Looks first at the options passed to the scraper on creation;
# if the key is absent there, falls back to the option set on the
# scraper class (class options are inherited from the parent class).
def option(symbol)
  if options.has_key?(symbol)
    options[symbol]
  else
    self.class.options[symbol]
  end
end
|
950
|
+
|
951
|
+
|
952
|
+
end
|
953
|
+
|
954
|
+
|
955
|
+
# Builds an anonymous scraper class and returns it. The block is
# evaluated in the context of the new class (a Scraper::Base
# subclass), so the usual class-level DSL applies.
#
# For example:
#   links = Scraper.define do
#     process "a[href]", :urls=>"@href"
#     result :urls
#   end
#
#   puts links.scrape(html)
def self.define(&block)
  anonymous = Class.new(Scraper::Base)
  anonymous.module_eval(&block)
  anonymous
end
|
969
|
+
|
970
|
+
end
|