scrapes 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,45 @@
1
+ ################################################################################
2
+ #
3
+ # Copyright (C) 2006 Peter J Jones (pjones@pmade.com)
4
+ #
5
+ # Permission is hereby granted, free of charge, to any person obtaining
6
+ # a copy of this software and associated documentation files (the
7
+ # "Software"), to deal in the Software without restriction, including
8
+ # without limitation the rights to use, copy, modify, merge, publish,
9
+ # distribute, sublicense, and/or sell copies of the Software, and to
10
+ # permit persons to whom the Software is furnished to do so, subject to
11
+ # the following conditions:
12
+ #
13
+ # The above copyright notice and this permission notice shall be
14
+ # included in all copies or substantial portions of the Software.
15
+ #
16
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
23
+ #
24
+ ################################################################################
25
module Scrapes
  ################################################################################
  # Make it easy to access HTTP cookies.
  #
  # Cookies is a plain Hash of cookie-name => cookie-value pairs, with helpers
  # for converting to and from HTTP header strings.
  class Cookies < Hash
    ################################################################################
    # Convert the current set of cookies into an HTTP +Cookie+ header value,
    # e.g. <tt>"a=1;b=2"</tt>.
    def to_header
      map {|k,v| "#{k}=#{v}"}.join(';')
    end

    ################################################################################
    # Parse a single HTTP +Set-Cookie+ header value and store the cookie.
    # Attributes after the first ';' (path, expires, ...) are discarded.
    # A nil/empty header or a header without a cookie name is ignored instead
    # of raising (the old code crashed on nil) or storing a garbage key.
    def from_header (header)
      return if header.nil? || header.empty?
      k, v = header.sub(/;.*$/, '').split(/\s*=\s*/, 2)
      self[k] = v unless k.nil? || k.empty?
    end

  end
  ################################################################################
end
################################################################################
@@ -0,0 +1,97 @@
1
+ ################################################################################
2
+ #
3
+ # Copyright (C) 2006 Peter J Jones (pjones@pmade.com)
4
+ #
5
+ # Permission is hereby granted, free of charge, to any person obtaining
6
+ # a copy of this software and associated documentation files (the
7
+ # "Software"), to deal in the Software without restriction, including
8
+ # without limitation the rights to use, copy, modify, merge, publish,
9
+ # distribute, sublicense, and/or sell copies of the Software, and to
10
+ # permit persons to whom the Software is furnished to do so, subject to
11
+ # the following conditions:
12
+ #
13
+ # The above copyright notice and this permission notice shall be
14
+ # included in all copies or substantial portions of the Software.
15
+ #
16
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
23
+ #
24
+ ################################################################################
25
+ require 'net/http'
26
+ require 'pathname'
27
+ require 'scrapes/cache'
28
+ ################################################################################
29
module Scrapes
  ################################################################################
  # Try to suck down a URI.
  #
  # The crawler fetches pages over HTTP for a Session, handling caching,
  # cookies, form posts, and redirects.
  class Crawler
    ################################################################################
    # Maximum number of HTTP redirects followed before giving up.
    MAX_REDIRECTS = 10

    ################################################################################
    # The cache object that this crawler is using
    attr_accessor :cache

    ################################################################################
    # The optional log object that this crawler is using
    attr_accessor :log

    ################################################################################
    # Create a new crawler for the given session
    def initialize (session)
      @session = session
      @log     = nil
      @verbose = 0
      @delay   = 0.5   # polite delay (seconds) between live fetches
      @cache   = Cache.new
    end

    ################################################################################
    # Fetch a URI, using HTTP GET unless you supply +post+ data.
    #
    # GET responses are served from (and written back to) the cache; POSTs are
    # never cached.  Session cookies are sent with the request and any
    # +Set-Cookie+ response header is stored back into the session.  Redirects
    # are followed up to +limit+ times; exceeding the limit raises a
    # RuntimeError instead of recursing forever (the old code had no bound).
    def fetch (uri, post={}, headers={}, limit=MAX_REDIRECTS)
      raise "too many HTTP redirects while fetching #{uri}" if limit < 0

      @session.refresh
      uri = URI.parse(@session.absolute_uri(uri))

      cached = @cache.check(uri) if post.empty?
      @log.info((cached ? 'C ' : 'N ') + uri.to_s) if @log

      return cached if cached # FIXME
      sleep(@delay) if @delay != 0

      path = uri.path.dup
      path << "/" if path.empty?
      path << "?" + uri.query if uri.query

      req = post.empty? ? Net::HTTP::Get.new(path) : Net::HTTP::Post.new(path)
      req.set_form_data(post) unless post.empty?

      req['Cookie'] = @session.cookies.to_header
      headers.each {|k,v| req[k] = v}

      res = Net::HTTP.new(uri.host, uri.port).start {|http| http.request(req)}

      if @verbose >= 2
        STDERR.puts "-----------------------------------------------"
        STDERR.puts res.class
        res.each_header {|k,v| STDERR.puts "#{k}: #{v}"}
      end

      # FIXME, what to do about more than one cookie
      @session.cookies.from_header(res['set-cookie']) if res.key?('set-cookie')

      case res
      when Net::HTTPRedirection
        @session.base_uris[-1] = @session.absolute_uri(res['location'])
        res = fetch(res['location'], {}, headers, limit - 1)
      end

      post.empty? and @cache.update(uri, res.body)
      res
    end

  end
  ################################################################################
end
################################################################################
@@ -0,0 +1,110 @@
1
+ ################################################################################
2
+ #
3
+ # Copyright (C) 2006 Peter J Jones (pjones@pmade.com)
4
+ #
5
+ # Permission is hereby granted, free of charge, to any person obtaining
6
+ # a copy of this software and associated documentation files (the
7
+ # "Software"), to deal in the Software without restriction, including
8
+ # without limitation the rights to use, copy, modify, merge, publish,
9
+ # distribute, sublicense, and/or sell copies of the Software, and to
10
+ # permit persons to whom the Software is furnished to do so, subject to
11
+ # the following conditions:
12
+ #
13
+ # The above copyright notice and this permission notice shall be
14
+ # included in all copies or substantial portions of the Software.
15
+ #
16
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
23
+ #
24
+ ################################################################################
25
+ require 'cgi'
26
+ require 'hpricot'
27
+ ################################################################################
28
module Scrapes
  ################################################################################
  module Hpricot # :nodoc:
    ################################################################################
    # Helpers for pulling text out of Hpricot nodes.  All extractors are
    # module functions, so they can be called as Extractors::text(node) or
    # mixed into a class.
    module Extractors
      ################################################################################
      # Returns the text of any child text nodes recursively concatenated.
      def text(node)
        text_process(node,String) do |e| text(e) end
      end

      ################################################################################
      # Returns the text of any child text nodes recursively as a nested Array.
      def texts(node)
        text_process(node,Array) do |e| texts(e) end
      end

      ################################################################################
      # Returns the text of any direct child text nodes concatenated.
      def content(node)
        text_process(node,String) do |e| e.content end
      end

      ################################################################################
      # Returns the text of any direct child text nodes as an Array.
      def contents(node)
        text_process(node,Array) do |e| e.content end
      end

      ################################################################################
      # The result of text() with whitespace reduced to single spaces and stripped.
      def word(node)
        text_process(node,String) do |e| word(e).gsub(/\s+/,' ').strip end
      end

      ################################################################################
      # The result of texts() flattened, whitespace reduced to single spaces,
      # stripped, and with all blank? strings rejected.
      def words(node)
        texts(node).flatten.compact.map{|e|e.gsub(/\s+/,' ').strip}.reject{|e| e.blank?}
      end

      ################################################################################
      # Just return the given node unprocessed.
      def xml(node)
        node
      end

      protected
      ################################################################################
      # Yield, then HTML-unescape the block's result.  Strings are unescaped
      # in place, Arrays are unescaped element by element, nil passes through.
      def unescape
        case result = yield
        when String then CGI::unescapeHTML(result).gsub('&nbsp;', ' ')
        when Array then result.map{|e| Extractors::unescape{e}}
        when NilClass then nil
        else raise "should be Array or String, was: #{result.class}"
        end
      end
      ################################################################################
      # Walk +node+, accumulating a +klass+ (String or Array) from its
      # children via +block+, then unescape the result.  Unknown node types
      # yield nil.  The "rescue nil" skips children the block can't handle.
      def text_process(node, klass, &block)
        Extractors::unescape do
          case node
          when Array, ::Hpricot::Elements
            node.map do |elem|
              text_process(elem,klass,&block)
            end
          when ::Hpricot::Elem, ::Hpricot::Doc
            node.children.inject(klass.new) do |value,child|
              (value << block.call(child)) rescue nil
              value
            end
          when ::Hpricot::Text then node.content
          end
        end
      end

      # :unescape must be a module function too: it is always called with an
      # explicit Extractors:: receiver, which a plain protected instance
      # method would not answer (the old list omitted it -> NoMethodError).
      module_function :word, :words, :text, :texts, :content, :contents,
                      :text_process, :unescape
    end
    ################################################################################
  end
  ################################################################################
end
################################################################################
@@ -0,0 +1,86 @@
1
+ ################################################################################
2
+ #
3
+ # Copyright (C) 2006 Peter J Jones (pjones@pmade.com)
4
+ #
5
+ # Permission is hereby granted, free of charge, to any person obtaining
6
+ # a copy of this software and associated documentation files (the
7
+ # "Software"), to deal in the Software without restriction, including
8
+ # without limitation the rights to use, copy, modify, merge, publish,
9
+ # distribute, sublicense, and/or sell copies of the Software, and to
10
+ # permit persons to whom the Software is furnished to do so, subject to
11
+ # the following conditions:
12
+ #
13
+ # The above copyright notice and this permission notice shall be
14
+ # included in all copies or substantial portions of the Software.
15
+ #
16
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
23
+ #
24
+ ################################################################################
25
module Scrapes
  ################################################################################
  # Initialize the Scrapes library.
  #
  # An Initializer knows where the application keeps its page classes and
  # takes care of loading every one of them.
  class Initializer
    ################################################################################
    # The directory name where the pages classes are kept
    attr_accessor :pages_dir

    ################################################################################
    # The parent directory where the pages_dir can be found
    attr_accessor :pages_parent

    ################################################################################
    # Create a new Initializer, yield it for configuration, and return it.
    def self.run (&block)
      new.tap {|initializer| block.call(initializer) if block}
    end

    ################################################################################
    # Establish all the defaults
    def initialize
      @pages_dir    = 'pages'
      @pages_parent = File.dirname($0)
    end

    ################################################################################
    # Run all the initialization methods
    def process
      load_pages
    end

    ################################################################################
    private

    ################################################################################
    # Load every Ruby file found in the pages directory, in sorted order.
    def load_pages
      pattern = File.join(@pages_parent, @pages_dir, '*.rb')
      reloader(Dir.glob(pattern).sort)
    end

    ################################################################################
    # Keep loading files until all NameError issues (caused by inter-file
    # dependencies) are resolved, giving up after +limit+ extra passes.
    def reloader (files, limit=4)
      failed = files.select do |file|
        begin
          load File.expand_path(file)
          false
        rescue NameError
          raise if limit <= 0
          true
        end
      end

      reloader(failed, limit - 1) unless failed.empty?
    end

  end
end
################################################################################
@@ -0,0 +1,319 @@
1
+ ################################################################################
2
+ #
3
+ # Copyright (C) 2006 Peter J Jones (pjones@pmade.com)
4
+ #
5
+ # Permission is hereby granted, free of charge, to any person obtaining
6
+ # a copy of this software and associated documentation files (the
7
+ # "Software"), to deal in the Software without restriction, including
8
+ # without limitation the rights to use, copy, modify, merge, publish,
9
+ # distribute, sublicense, and/or sell copies of the Software, and to
10
+ # permit persons to whom the Software is furnished to do so, subject to
11
+ # the following conditions:
12
+ #
13
+ # The above copyright notice and this permission notice shall be
14
+ # included in all copies or substantial portions of the Software.
15
+ #
16
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
23
+ #
24
+ ################################################################################
25
+ require 'scrapes/rule_parser'
26
+ require 'hpricot'
27
+ require 'rextra'
28
+ ################################################################################
29
module Scrapes
  ################################################################################
  # The page class is used as a base class for scraping data out of one web
  # page.  To use it, you inherit from it and setup some rules.  You can also
  # use validators to ensure that the page was scraped correctly.
  #
  # == Setup
  #
  #  class MyPageScraper < Scrapes::Page
  #    rule :rule_name, blah
  #  end
  #
  # Scrapes::RuleParser explains the use of rules.
  #
  # == Auto Loading
  #
  # Scrapes will automatically 'require' ruby files placed in a special
  # 'pages' directory.  The idea is to place one Scrapes::Page derived class
  # per file in the pages directory, and have it required for you.
  #
  # == Validations
  #
  # There are a few class methods that you can use to validate the contents
  # you scraped from a given web page.
  class Page
    include Scrapes::Hpricot::Extractors

    # External helper program used by with_xslt to preprocess page HTML.
    XSLTPROC = 'xsltproc' # :nodoc:

    ################################################################################
    # RuleParser is used to extract data from web pages using CSS selectors
    # and raw element access by using procs.
    include RuleParser

    ################################################################################
    # Access the URI where this page's data came from
    attr_accessor :uri

    ################################################################################
    # Access the session object that was used to fetch this page's data
    attr_accessor :session

    ################################################################################
    # Access the Hpricot object that the selectors are passed
    attr_accessor :hpricot

    ################################################################################
    # If the page that you are parsing is paginated (one page in many of similar
    # data) you can use this class method to automatically fetch all pages.  In
    # order for this to work, you need to provide a few special methods:
    #
    # === Next Page
    #
    # If you know the URL to the next page, then provide an instance method
    # called <tt>next_page</tt>.  It should return the URL for the next page,
    # or nil when the current page is the last page.
    #
    #  class NextPageExample < Scrapes::Page
    #    rule(:next_page, 'a[href~=next]', '@href', 1)
    #  end
    #
    # === Link for Page
    #
    # Alternatively, you can provide an instance method <tt>link_for_page</tt>
    # and another one called <tt>pages</tt>.  The <tt>pages</tt> method should
    # return the number of pages in this paginated set.  The
    # <tt>link_for_page</tt> method should take a page number, and return a URL
    # to fetch that page.
    #
    #  class LinkForPageExample < Scrapes::Page
    #    rule_1(:page) {|e| m = e.text.match(/Page\s+\d+\s+of\s+(\d+)/) and m[1].to_i}
    #
    #    def link_for_page (page)
    #      uri.sub(/page=\d+/, "page=#{page}")
    #    end
    #  end
    #
    # === Append to Page
    #
    # Finally, you must provide an <tt>append_page</tt> method.  It takes an
    # instance of your Scrapes::Page derived class as an argument.  Its job is
    # to add the data found on the current page to its instance variables.
    # This is because when you use paginated, it only returns one instance of
    # your class.
    def self.paginated
      meta_eval { @paginated = true }
    end

    ################################################################################
    # Make Page.extract return an array by calling the given method.  This can
    # be very useful for when your class does nothing more than collect a set
    # of links for some other page to process.  It causes Session#page to call
    # the given block once for each object returned from method_to_call.
    def self.acts_as_array (method_to_call)
      meta_eval { @as_array = method_to_call }
    end

    ################################################################################
    # Preprocess the HTML by sending it through an XSLT stylesheet.  The
    # stylesheet should return a document that can be then processed using your
    # rules.  Using this feature requires that you have the xsltproc utility in
    # your PATH.  You can get xsltproc from libxslt: http://xmlsoft.org/XSLT/
    def self.with_xslt (filename)
      raise "#{XSLTPROC} could not be found" unless `#{XSLTPROC} --version 2>&1`.match(/libxslt/)
      meta_eval { @with_xslt = filename }
    end

    ################################################################################
    # Ensure that the given attributes have been set by matching rules
    def self.validates_presence_of (*attrs)
      attrs, options = attrs_options(attrs, {
        :message => 'rule never matched',
      })

      validates_from(attrs, options, lambda {|a| !a.nil?})
    end

    ################################################################################
    # Ensure that the given attributes are not #blank?
    def self.validates_not_blank (*attrs)
      attrs, options = attrs_options(attrs, {
        # was 'rule never matched' — a copy-paste of the presence message that
        # misreported why a blank (but matched) attribute failed validation
        :message => 'must not be blank',
      })

      validates_from(attrs, options, lambda {|a| !a.blank?})
    end

    ################################################################################
    # Ensure that the given attributes have the correct format
    def self.validates_format_of (*attrs)
      attrs, options = attrs_options(attrs, {
        :message => 'did not match regular expression',
        :with    => /.*/,
      })

      validates_from(attrs, options, lambda {|a| a.to_s.match(options[:with])})
    end

    ################################################################################
    # Ensure that the given attributes have values in the given list
    def self.validates_inclusion_of (*attrs)
      attrs, options = attrs_options(attrs, {
        :message => 'is not in the list of accepted values',
        :in      => [],
      })

      validates_from(attrs, options, lambda {|a| options[:in].include?(a)})
    end

    ################################################################################
    # Ensure that the given attribute is a number
    def self.validates_numericality_of (*attrs)
      attrs, options = attrs_options(attrs, {
        :message => 'is not a number',
      })

      # Kernel.Float is strict: it raises on anything not fully numeric.
      closure = lambda do |a|
        begin
          Kernel.Float(a.to_s)
        rescue ArgumentError, TypeError
          false
        else
          true
        end
      end

      validates_from(attrs, options, closure)
    end

    ################################################################################
    # If using acts_as_array that returns links, send them to another class
    def self.to (other_class)
      ToProxy.new(self, other_class)
    end

    ################################################################################
    # Called by the crawler to process a web page.  Parses +data+, follows
    # pagination when enabled, unwraps acts_as_array results, and either
    # returns the resulting object or yields each element to +block+.
    def self.extract (data, uri, session, &block)
      obj = process_page(data, uri, session)

      if meta_eval {@paginated}
        if obj.respond_to?(:next_page)
          sister = obj

          while sister_uri = sister.next_page
            sister = extract_sister(session, obj, sister_uri)
          end
        elsif obj.respond_to?(:link_for_page)
          (2 .. obj.pages).each do |page|
            sister_uri = obj.link_for_page(page)
            extract_sister(session, obj, sister_uri)
          end
        end
      end

      as_array = meta_eval {@as_array}
      obj = obj.send(as_array) if as_array

      return obj unless block
      obj.respond_to?(:each) ? obj.each {|o| yield(o)} : yield(obj)
    end

    ################################################################################
    # Have a chance to do something after parsing, but before validation
    def after_parse
    end

    ################################################################################
    # Called by the extract method to validate scraped data.  If you override
    # this method, you should call super.  This method will probably be changed
    # in the future so that you don't have to call super.
    def validate
      # Guard against nil: @validations is only initialized by the inherited
      # hook, so a class that never triggered it must not crash here.
      validations = self.class.meta_eval { @validations } || []

      validations.each do |v|
        raise "#{self.class}.#{v[:name]} #{v[:options][:message]}" unless
          v[:proc].call(send(v[:name]))
      end

      self
    end

    ################################################################################
    protected

    ################################################################################
    # Called by extract to process a page object
    def self.process_page (data, uri, session)
      if file = meta_eval { @with_xslt }
        # NOTE(review): +file+ is interpolated into a shell command; a filename
        # containing a single quote would break it.  The filename comes from
        # the developer via with_xslt, not from scraped input, so this is
        # flagged rather than rewritten.
        options = "--html '#{file}' -"

        open("|#{XSLTPROC} #{options} 2> /dev/null", 'w+') do |xsltproc|
          xsltproc << data
          xsltproc.close_write
          data = xsltproc.read
        end
      end

      obj = parse(Hpricot(data))
      obj.uri = uri
      obj.session = session
      obj.after_parse
      obj.validate
      obj
    end

    ################################################################################
    # Called by extract to process paginated objects
    def self.extract_sister (session, obj, sister_uri)
      res = session.crawler.fetch(sister_uri)
      sister = process_page(res.body, sister_uri, session)
      obj.append_page(sister)
      sister
    end

    ################################################################################
    private

    ################################################################################
    # Add some things to sub-classes
    def self.inherited (klass)
      klass.meta_eval do
        @validations = []
        @paginated   = false
        @as_array    = false
      end
    end

    ################################################################################
    # generic way to add validation
    def self.validates_from (attrs, options, closure)
      meta_eval do
        attrs.each do |a|
          @validations << {
            :name    => a,
            :options => options,
            :proc    => closure,
          }
        end
      end
    end

    ################################################################################
    # helper to correctly parse the validate calls
    def self.attrs_options (attrs, options)
      ops = attrs.pop if attrs.last.is_a?(Hash)
      options.update(ops) if ops
      [attrs, options]
    end

  end
  ################################################################################
end
################################################################################