scrapes 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- data/LICENSE +22 -0
- data/README +123 -0
- data/demo/demo.rb +33 -0
- data/demo/pages/about.rb +32 -0
- data/demo/pages/main.rb +32 -0
- data/lib/scrapes.rb +41 -0
- data/lib/scrapes/cache.rb +110 -0
- data/lib/scrapes/cookbook.rb +53 -0
- data/lib/scrapes/cookies.rb +45 -0
- data/lib/scrapes/crawler.rb +97 -0
- data/lib/scrapes/hpricot.rb +110 -0
- data/lib/scrapes/initializer.rb +86 -0
- data/lib/scrapes/page.rb +319 -0
- data/lib/scrapes/rule_parser.rb +327 -0
- data/lib/scrapes/session.rb +155 -0
- data/lib/scrapes/to_proxy.rb +50 -0
- data/test/cache.rb +75 -0
- data/test/cookies.rb +34 -0
- data/test/crawler.rb +69 -0
- data/test/hpricot.rb +55 -0
- data/test/initializer.rb +54 -0
- data/test/lib/server.rb +63 -0
- data/test/page.rb +77 -0
- data/test/pages/foils.rb +61 -0
- data/test/pages/foils2.rb +38 -0
- data/test/pages/redhanded_entries.rb +36 -0
- data/test/pages/redhanded_main.rb +58 -0
- data/test/pages/rule_parser.rb +81 -0
- data/test/pages/simple.rb +21 -0
- data/test/public/foil72.html +10 -0
- data/test/public/foil73.html +9 -0
- data/test/public/foil74.html +11 -0
- data/test/public/foo.txt +1 -0
- data/test/public/index.html +20 -0
- data/test/public/redhanded.html +1208 -0
- data/test/public/rule_parser.html +21 -0
- data/test/public/simple.html +8 -0
- data/test/rule_parser.rb +151 -0
- data/test/session.rb +45 -0
- data/test/textcontent.rb +71 -0
- metadata +123 -0
@@ -0,0 +1,327 @@
|
|
1
|
+
################################################################################
|
2
|
+
#
|
3
|
+
# Copyright (C) 2006 Peter J Jones (pjones@pmade.com)
|
4
|
+
#
|
5
|
+
# Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
# a copy of this software and associated documentation files (the
|
7
|
+
# "Software"), to deal in the Software without restriction, including
|
8
|
+
# without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
# distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
# permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
# the following conditions:
|
12
|
+
#
|
13
|
+
# The above copyright notice and this permission notice shall be
|
14
|
+
# included in all copies or substantial portions of the Software.
|
15
|
+
#
|
16
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
23
|
+
#
|
24
|
+
################################################################################
|
25
|
+
#--
|
26
|
+
# This started as a branch of the uformatparser lib by:
|
27
|
+
# Author:: Assaf Arkin assaf@labnotes.org
|
28
|
+
# Documentation:: http://trac.labnotes.org/cgi-bin/trac.cgi/wiki/Ruby/MicroformatParser
|
29
|
+
# Copyright:: Copyright (c) 2005 Assaf Arkin
|
30
|
+
# License:: Creative Commons Attribution-ShareAlike
|
31
|
+
# Rewrite and Hpricot support by Michael Garriss
|
32
|
+
#++
|
33
|
+
################################################################################
|
34
|
+
require 'yaml'
|
35
|
+
require 'scrapes/hpricot'
|
36
|
+
################################################################################
|
37
|
+
module Scrapes
|
38
|
+
################################################################################
|
39
|
+
# The methods defined here are available at the class scope level of a Scrapes::Page
|
40
|
+
# subclass. For example:
|
41
|
+
# class Foobar < Scrapes::Page
|
42
|
+
# rule :foo, 'foo'
|
43
|
+
# rule_1 :bar, 'bar', 'text()'
|
44
|
+
# end
|
45
|
+
#--
|
46
|
+
# === Using <tt>rule</tt>
|
47
|
+
# === Using <tt>rule_1</tt>
|
48
|
+
# === Using <tt>selector</tt>
|
49
|
+
# === Using <tt>extractor</tt>
|
50
|
+
#++
|
51
|
+
module RuleParser
|
52
|
+
################################################################################
|
53
|
+
# name:: the name later used to invoke this rule
|
54
|
+
# select:: the selector to use, String or Symbol
|
55
|
+
# extract:: the extractor to use, String, Symbol, or Class. See RuleParser#extractor
|
56
|
+
# limit:: the limit of nodes to send to extractor
|
57
|
+
# block:: a block extractor, must not be defined if extract is non-nil
|
58
|
+
# Example:
|
59
|
+
# class Foobar < Scrapes::Page
|
60
|
+
# rule :foo, 'foo'
|
61
|
+
# end
|
62
|
+
# Later it's used as an instance method on the Scrapes::Page objects like this:
|
63
|
+
# foobar.foo.each do |foo|
|
64
|
+
# example.attr << foo
|
65
|
+
# end
|
66
|
+
def rule(name, select = '', extract = nil, limit = -1, &block)
  raise InvalidRuleException, "First argument (rule name) is required" unless name
  # `attr name, true` (reader + writer) is the deprecated boolean form of
  # Module#attr; attr_accessor is the modern, warning-free equivalent.
  attr_accessor name
  # Compile selector/extractor immediately (anonymous: name is nil) and queue
  # the rule for RuleParser#parse.
  self.rules << Rule.new(name, selector(nil, select), extractor(nil, extract, &block), limit)
end
|
71
|
+
|
72
|
+
################################################################################
|
73
|
+
# Almost the same as rule except forces limit to be 1. The other difference is
|
74
|
+
# that RuleParser#rule returns collections of matches (an Array of size 1 even) whereas
|
75
|
+
# RuleParser#rule_1 just returns the match.
|
76
|
+
# name:: the name later used to invoke this rule
|
77
|
+
# select:: the selector to use, String or Symbol
|
78
|
+
# extract:: the extractor to use, String, Symbol, or Class
|
79
|
+
# block:: a block extractor, must not be defined if extract is non-nil
|
80
|
+
# Example:
|
81
|
+
# class Foobar < Scrapes::Page
|
82
|
+
# rule_1 :bar, 'tr'
|
83
|
+
# end
|
84
|
+
# Later it's used as an instance method on the Scrapes::Page objects like this:
|
85
|
+
# example.attr = foobar.bar
|
86
|
+
# Convenience wrapper around #rule that pins the match limit to one.
def rule_1(name, selector = '', extractor = nil, &extractor_block)
  # A limit of 1 makes Rule#process store the single match directly
  # instead of wrapping it in an Array.
  rule(name, selector, extractor, 1, &extractor_block)
end
|
89
|
+
|
90
|
+
################################################################################
|
91
|
+
# Creates a standalone selector that can later be used in a rule. Example:
|
92
|
+
# class Foobar < Scrapes::Page
|
93
|
+
# selector :foo_select, 'table'
|
94
|
+
# rule_1 :bar, :foo_select # a Symbol triggers use of the selector
|
95
|
+
# end
|
96
|
+
# name:: the name later used to invoke this selector
|
97
|
+
# select:: the selector to use, String or NilClass
|
98
|
+
# block:: a block selector, must not be defined if select is non-nil
|
99
|
+
# A block selector is yielded the Hpricot doc object just once. The collection it
|
100
|
+
# returns is iterated over and each match is passed to the extractor. Example:
|
101
|
+
# class Foobar < Scrapes::Page
|
102
|
+
# selector :foo_select_2 do |hpricot_doc|
|
103
|
+
# hpricot_doc.search('table')
|
104
|
+
# end
|
105
|
+
# rule_1 :bar, :foo_select_2 # a Symbol triggers use of the selector
|
106
|
+
# end
|
107
|
+
# String selectors passed to <tt>rule</tt> or <tt>rule_1</tt> are interpreted as Hpricot
|
108
|
+
# search strings. See http://code.whytheluckystiff.net/hpricot/wiki/AnHpricotShowcase
|
109
|
+
def selector(name, select = nil, &select_block)
  # Delegate to the shared registrar; '@selector' selects the class-level
  # selector registry (vs. '@extractor').
  tor('@selector', name, select, &select_block)
end
|
112
|
+
|
113
|
+
################################################################################
|
114
|
+
# Creates a standalone extractor that can later be used in a rule. Example:
|
115
|
+
# class Foobar < Scrapes::Page
|
116
|
+
# extractor :mailto_extract do |elem|
|
117
|
+
# elem.attributes['href'].sub(/mailto:/,'') # remove the mailto: string
|
118
|
+
# end
|
119
|
+
# rule :emails, 'a[@href^="mailto:"]', :mailto_extract
|
120
|
+
# end
|
121
|
+
# name:: the name later used to invoke this selector
|
122
|
+
# extract:: the extractor to use, String or NilClass
|
123
|
+
# block:: a block extractor, must not be defined if extract is non-nil
|
124
|
+
# A block extractor is yielded each object that matched the rules's selector.
|
125
|
+
#
|
126
|
+
# Extractors passed to <tt>rule</tt> or <tt>rule_1</tt> are interpreted based on
|
127
|
+
# the class of the extractor as follows
|
128
|
+
# ==== NilClass
|
129
|
+
# The result of the selector is just re-returned. Thus <tt>foo.my_rule</tt> would
|
130
|
+
# just return the selector results defined on the :my_rule rule.
|
131
|
+
# ==== Symbol
|
132
|
+
# A custom extractor is used. See above docs for this method for an example.
|
133
|
+
# ==== Class
|
134
|
+
# A nested class of the given name is used as a new inner-parser. An instance of that
|
135
|
+
# class is returned from each invocation of the extractor. Example:
|
136
|
+
# class Outer < Scrapes::Page
|
137
|
+
# class Inner < Scrapes::Page
|
138
|
+
# rule_1 :bold_text, 'b', 'text()'
|
139
|
+
# rule_1 :img_src, 'img[@src]', '@src'
|
140
|
+
# end
|
141
|
+
# rule :items, 'tr', Inner
|
142
|
+
# end
|
143
|
+
# Now calling <tt>my_page.items</tt> returns an Array of Inner objects that each
|
144
|
+
# separately parses out the bold text and image source of each table row in the
|
145
|
+
# document.
|
146
|
+
# ==== String
|
147
|
+
# Two patterns:
|
148
|
+
# @foobar:: extract out the contents of an attribute named 'foobar'
|
149
|
+
# foobar():: invoke the foobar builtin extractor, see Scrapes::Hpricot::Extractors
|
150
|
+
def extractor(name, extract = nil, &extract_block)
  # Delegate to the shared registrar; '@extractor' selects the class-level
  # extractor registry (vs. '@selector').
  tor('@extractor', name, extract, &extract_block)
end
|
153
|
+
|
154
|
+
################################################################################
|
155
|
+
# Runs every registered rule against +node+, storing each rule's matches on
# +context+ (a fresh instance of this page class by default).  Returns the
# populated context.
def parse(node, context = nil, rules = nil) # :nodoc:
  context ||= self.new()
  rules ||= self.rules
  # NOTE: the original also built a `less_rules` clone of the rule list and
  # nil-ed out every rule that fired, but never read it afterwards — dead
  # bookkeeping (likely vestigial from uformatparser) removed; behavior is
  # unchanged.
  rules.each { |rule| rule.process(node, context) if rule } if rules
  context
end
|
168
|
+
|
169
|
+
################################################################################
|
170
|
+
# Lazily-initialized array of Rule objects registered via #rule / #rule_1.
def rules() # :nodoc:
  @microparser_rules ||= []
end
|
173
|
+
|
174
|
+
private
|
175
|
+
|
176
|
+
################################################################################
|
177
|
+
# Shared registrar behind #selector and #extractor ("tor" is their common
# suffix).  +type+ is '@selector' or '@extractor' and picks which class-level
# registry the compiled callable is stored in.  +tor_arg+ (or +block+) may be:
#   nil         -> identity proc (pass the node through unchanged)
#   String      -> an Hpricot search (selectors) or an Extractor statement
#                  (extractors)
#   Proc/Method -> used as-is
#   Symbol      -> dispatched via send(tor_arg, node) at call time
#   Class       -> must respond to .parse; used as a nested page parser
# When +name+ is given, an instance method of that name is also defined that
# invokes the registered callable.  Returns the compiled callable.
def tor(type, name, tor_arg = nil, &block)
  raise InvalidRuleException, "can't use both arg and block" if tor_arg and block
  result = case (tor_arg ||= block)
    when NilClass then proc {|node| node}
    when String
      if type == '@selector'
        proc {|node| node.search(tor_arg)}
      else
        Extractor.new self, tor_arg
      end
    when Proc, Method then tor_arg
    when Symbol then proc {|node| send(tor_arg,node) }
    when Class
      begin
        # Probe for the required interface; raises NameError when missing.
        tor_arg.method(:parse)
      rescue NameError=>error
        raise InvalidRuleException,
          "Selector class must implement the method parse", error.backtrace
      end
      tor_arg
    else
      raise InvalidRuleException,
        "Invalid tor type: must be a string, parser class, block or nil"
  end
  # TODO dry -- the two branches below differ only in the registry ivar name.
  # NOTE(review): class_def is presumably a method-defining helper declared
  # elsewhere in scrapes (not visible in this file chunk) — confirm.
  if type == "@selector"
    self.class.class_eval { (@selector ||= {})[name] = result }
    class_def(name) do |node|
      self.class.class_eval { @selector[name].call(node) }
    end if name
  else
    self.class.class_eval { (@extractor ||= {})[name] = result }
    class_def(name) do |node|
      self.class.class_eval { @extractor[name].call(node) }
    end if name
  end
  result
end
|
215
|
+
|
216
|
+
################################################################################
|
217
|
+
# Hook: mixing RuleParser into a class promotes these methods to the class
# level and also pulls in the builtin Hpricot extractors.
def self.included(mod) # :nodoc:
  [self, Scrapes::Hpricot::Extractors].each { |helpers| mod.extend(helpers) }
end
|
221
|
+
|
222
|
+
################################################################################
|
223
|
+
# Internal pairing of a compiled selector and extractor, queued per page class
# by RuleParser#rule.
class Rule #:nodoc:all
  attr :name
  attr :limit,true
  attr :selector
  attr :extractor

  ##############################################################################
  def initialize(name, selector, extractor, limit)
    @name      = name.to_s.intern
    @selector  = selector
    @extractor = extractor
    @limit     = limit
  end

  ##############################################################################
  # Runs this rule against +node+, storing the extracted matches on +context+
  # in the instance variable @<name>.  Always returns true.
  def process(node, context)
    context.instance_variable_set '@hpricot', node
    return true if @limit == 0
    selected = @selector.call(node)
    selected = [selected] unless selected.respond_to? :each
    matches = context.instance_variable_set "@#@name", []
    selected.compact.each do |match|
      extracted =
        case @extractor
        when UnboundMethod then @extractor.bind(context).call(match)
        when Extractor     then @extractor.extract(match)
        when Proc, Method  then @extractor.call(match)
        when Class         then @extractor.parse(match)
        end
      next unless extracted
      matches << extracted
      break if matches.size == @limit
    end
    # rule_1 semantics: unwrap the single match instead of storing an Array.
    context.instance_variable_set "@#@name", matches[0] if @limit == 1
    true
  end

  ##############################################################################
  def inspect
    if @selector
      "[to #{@name} from #{@selector.inspect}, #{@extractor.inspect}, limit #{@limit}]"
    else
      "[to #{@name} from #{@extractor.inspect}, limit #{@limit}]"
    end
  end
end
|
261
|
+
|
262
|
+
################################################################################
|
263
|
+
# Compiles a pipe-separated extraction statement (e.g. "a@href|text()") into a
# list of callables that are tried in order until one produces a value.
class Extractor # :nodoc:all
  # Parses one extraction piece into three parts:
  #   $1 function name (excluding parentheses)
  #   $2 element name
  #   $3 attribute name (including leading @)
  # If a match is found the result is either $1, or $2 and/or $3.
  REGEX = /^(\w+)\(\)|([A-Za-z][A-Za-z0-9_\-:]*)?(@[A-Za-z][A-Za-z0-9_\-:]*)?$/

  ##############################################################################
  def initialize(context, statement) # :nodoc:
    statement.strip!
    @extracts = statement.split('|').map { |piece| compile(context, piece) }
    raise InvalidRuleException, "Invalid (empty) extraction statement" if
      @extracts.size == 0
  end

  ##############################################################################
  # Returns the first truthy value produced by the compiled pieces for +node+.
  def extract(node) # :nodoc:
    value = nil
    @extracts.find { |piece| value = piece.call(node) }
    value
  end

  ##############################################################################
  def inspect() # :nodoc:
    @extracts.join('|')
  end

  private

  ##############################################################################
  # Turns a single extraction piece into a Method or Proc.
  def compile(context, piece)
    groups = REGEX.match(piece)
    func, element, attribute = groups[1], groups[2], groups[3]
    if func
      # builtin/custom function extractor, looked up on the page class
      begin
        context.method(func)
      rescue NameError=>error
        raise InvalidRuleException, error.message, error.backtrace
      end
    elsif element and attribute
      # "elem@attr": attribute value, but only on the named element
      attr_name = attribute[1..-1]
      proc { |node| node.attributes[attr_name] if node.name == element }
    elsif element
      # bare element name: its text content
      proc { |node| text(node) if node.name == element }
    elsif attribute
      # bare "@attr": attribute value from a node or a node collection
      attr_name = attribute[1..-1]
      proc do |node|
        if node.respond_to? :each
          node.all.attributes.all[attr_name]
        else
          node.attributes[attr_name]
        end
      end
    else
      raise InvalidRuleException, "Invalid extraction statement"
    end
  end
end
|
322
|
+
|
323
|
+
################################################################################
|
324
|
+
# Raised when a rule, selector, or extractor is declared incorrectly.
#
# Subclasses StandardError rather than Exception: deriving library errors
# directly from Exception is a Ruby anti-pattern because a plain `rescue`
# (which catches StandardError) would silently let it escape.  Callers that
# rescue InvalidRuleException explicitly are unaffected.
class InvalidRuleException < StandardError # :nodoc:all
end
|
326
|
+
end
|
327
|
+
end
|
@@ -0,0 +1,155 @@
|
|
1
|
+
################################################################################
|
2
|
+
#
|
3
|
+
# Copyright (C) 2006 Peter J Jones (pjones@pmade.com)
|
4
|
+
#
|
5
|
+
# Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
# a copy of this software and associated documentation files (the
|
7
|
+
# "Software"), to deal in the Software without restriction, including
|
8
|
+
# without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
# distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
# permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
# the following conditions:
|
12
|
+
#
|
13
|
+
# The above copyright notice and this permission notice shall be
|
14
|
+
# included in all copies or substantial portions of the Software.
|
15
|
+
#
|
16
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
23
|
+
#
|
24
|
+
################################################################################
|
25
|
+
require 'scrapes/crawler'
|
26
|
+
|
27
|
+
module Scrapes
|
28
|
+
################################################################################
|
29
|
+
# Session is used to process all web pages under a single session. This may
|
30
|
+
# be necessary when some web sites need you to login, or otherwise create
|
31
|
+
# a session ID with a cookie before you can continue processing pages.
|
32
|
+
class Session
  ##############################################################################
  # Logger object shared with the crawler (may be nil).
  attr_reader :log

  ##############################################################################
  # POST data used when establishing (or refreshing) the session.
  attr_accessor :post

  ##############################################################################
  # Seconds of inactivity after which #refresh re-establishes the session.
  attr_accessor :timeout

  ##############################################################################
  # The Cookies jar used by every request in this session.
  attr_accessor :cookies

  ##############################################################################
  # The URI the session was established from (see #uri=).
  attr_reader :uri

  ##############################################################################
  # The Crawler that performs the actual HTTP requests.
  attr_reader :crawler

  ##############################################################################
  # Stack of base URIs used by #absolute_uri to resolve relative links.
  attr_reader :base_uris

  ##############################################################################
  # Start a session using a HTTP GET
  def self.from_get (uri, &block)
    session = self.new
    session.uri = uri
    block ? yield(session) : session
  end

  ##############################################################################
  # Start a session using HTTP POST
  def self.from_post (uri, post, &block)
    session = self.new
    session.uri = uri
    session.post = post
    block ? yield(session) : session
  end

  ##############################################################################
  # Start a session without having to create a session with the web site first.
  def self.start (log=nil,&block)
    session = self.new(log)
    block ? yield(session) : session
  end

  ##############################################################################
  # log:: optional logger, forwarded to the crawler.
  def initialize log = nil
    @uri        = nil
    @post       = {}
    @when       = Time.at(0)   # time of the last session refresh (epoch = never)
    @timeout    = 900          # refresh after 15 minutes of inactivity
    @cookies    = Cookies.new
    @base_uris  = []
    @crawler    = Crawler.new(self)
    @crawler.log = @log = log
    @refreshing = false        # guards against re-entrant #refresh
  end

  ##############################################################################
  # Sets the session URI and records it as the first base URI for resolving
  # relative links.
  def uri= (uri)
    @uri = uri
    @base_uris << uri
  end

  ##############################################################################
  # Process a web page.  +link+ may be a single link or a list of links; each
  # one is fetched and handed to <tt>page_class.extract</tt>.  Returns the
  # extract result for the last link (nil when +link+ is nil).
  def page (page_class, link, post={}, &block)
    return if link.nil?
    link = [link] unless link.respond_to?(:to_ary)
    block ||= lambda {|data| data}
    result = nil

    link.each do |u|
      fetch(u, post) do |res|
        result = page_class.extract(res.body, u, self, &block)
      end
    end

    result
  end

  ##############################################################################
  # Fetch a URL in the session, but without a Scrapes::Page.  Yields the
  # crawler response to the block.  The resolved URI serves as the base URI
  # while the block runs; it is popped inside an +ensure+ so an exception
  # raised by the block no longer leaks an entry on the base-URI stack (the
  # previous implementation skipped the pop on error).  Returns the block's
  # result.
  def fetch (uri, post={}, &block)
    u = absolute_uri(uri)
    @base_uris.push(u)
    begin
      yield(@crawler.fetch(u, post))
    ensure
      @base_uris.pop
    end
  end

  ##############################################################################
  # Refresh the session, sometimes necessary when you are getting pages out of
  # the cache, but then go to the real web site and the session has expired.
  def refresh
    if !@refreshing and @uri and (Time.now - @when) > @timeout
      begin
        @refreshing = true
        @when = Time.now
        @cookies.clear

        # Bypass the cache so the web site actually sees the session request.
        @crawler.cache.without_cache do
          @crawler.fetch(uri, post)
        end
      ensure
        @refreshing = false
      end
    end

    self
  end

  ##############################################################################
  # Convert a relative URI to an absolute URI
  def absolute_uri (uri)
    return uri if @base_uris.empty?
    base = URI.parse(@base_uris.last)
    base.merge(uri).to_s
  end

end
|
153
|
+
################################################################################
|
154
|
+
end
|
155
|
+
################################################################################
|