scrapes 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,45 @@
1
+ ################################################################################
2
+ #
3
+ # Copyright (C) 2006 Peter J Jones (pjones@pmade.com)
4
+ #
5
+ # Permission is hereby granted, free of charge, to any person obtaining
6
+ # a copy of this software and associated documentation files (the
7
+ # "Software"), to deal in the Software without restriction, including
8
+ # without limitation the rights to use, copy, modify, merge, publish,
9
+ # distribute, sublicense, and/or sell copies of the Software, and to
10
+ # permit persons to whom the Software is furnished to do so, subject to
11
+ # the following conditions:
12
+ #
13
+ # The above copyright notice and this permission notice shall be
14
+ # included in all copies or substantial portions of the Software.
15
+ #
16
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
23
+ #
24
+ ################################################################################
25
module Scrapes
  ##############################################################################
  # A thin Hash subclass that stores HTTP cookies as name => value pairs and
  # converts them to and from HTTP header strings.
  class Cookies < Hash
    ############################################################################
    # Render every stored cookie as a single "Cookie:" request-header value,
    # e.g. {"a" => "1", "b" => "2"} becomes "a=1;b=2".
    def to_header
      pairs = []
      each_pair { |name, value| pairs << "#{name}=#{value}" }
      pairs.join(';')
    end

    ############################################################################
    # Store one cookie taken from a "Set-Cookie:" response-header value.
    # Cookie attributes such as path/expires (everything after the first ';')
    # are discarded; only the leading name=value pair is kept.
    def from_header (header)
      name, value = header.sub(/;.*$/, '').split(/\s*=\s*/, 2)
      self[name] = value
    end

  end
  ##############################################################################
end
################################################################################
@@ -0,0 +1,97 @@
1
+ ################################################################################
2
+ #
3
+ # Copyright (C) 2006 Peter J Jones (pjones@pmade.com)
4
+ #
5
+ # Permission is hereby granted, free of charge, to any person obtaining
6
+ # a copy of this software and associated documentation files (the
7
+ # "Software"), to deal in the Software without restriction, including
8
+ # without limitation the rights to use, copy, modify, merge, publish,
9
+ # distribute, sublicense, and/or sell copies of the Software, and to
10
+ # permit persons to whom the Software is furnished to do so, subject to
11
+ # the following conditions:
12
+ #
13
+ # The above copyright notice and this permission notice shall be
14
+ # included in all copies or substantial portions of the Software.
15
+ #
16
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
23
+ #
24
+ ################################################################################
25
+ require 'net/http'
26
+ require 'pathname'
27
+ require 'scrapes/cache'
28
+ ################################################################################
29
module Scrapes
  ##############################################################################
  # Fetches pages over HTTP on behalf of a Session.  Supports response caching
  # for GET requests, an optional logger, a politeness delay between requests,
  # cookie propagation via the session's cookie jar, and transparent handling
  # of HTTP redirects.
  class Crawler
    ############################################################################
    # The cache object that this crawler is using
    attr_accessor :cache

    ############################################################################
    # The optional log object that this crawler is using
    attr_accessor :log

    ############################################################################
    # Create a new crawler for the given session.  Defaults: no logger,
    # verbosity 0, a half-second delay between fetches, and a fresh Cache.
    # NOTE(review): Cache comes from scrapes/cache (not visible here) —
    # its check/update contract is assumed, see FIXME in fetch below.
    def initialize (session)
      @session = session
      @log = nil
      @verbose = 0
      @delay = 0.5
      @cache = Cache.new
    end

    ############################################################################
    # Fetch a URI, using HTTP GET unless you supply <tt>post</tt>.
    #
    # * +uri+     - absolute or relative URI; resolved via the session
    # * +post+    - form data hash; when non-empty an HTTP POST is made and
    #               the cache is bypassed entirely
    # * +headers+ - extra request headers (set after the Cookie header, so
    #               they can override it)
    #
    # Returns the Net::HTTPResponse — except on a cache hit, where the cached
    # value from Cache#check is returned as-is (hence the FIXME: the cached
    # object is presumably the body string, not a response — confirm against
    # scrapes/cache before relying on response methods).
    def fetch (uri, post={}, headers={})
      @session.refresh
      uri = URI.parse(@session.absolute_uri(uri))

      # Only GETs are cached; 'C '/'N ' marks cache hit vs network fetch.
      post.empty? and cached = @cache.check(uri)
      @log.info((cached ? 'C ' : 'N ') + uri.to_s) if @log

      return cached if cached # FIXME
      sleep(@delay) if @delay != 0

      # Build the request path by hand so the query string is preserved.
      path = uri.path.dup
      path << "/" if path.empty?
      path << "?" + uri.query if uri.query

      req = post.empty? ? Net::HTTP::Get.new(path) : Net::HTTP::Post.new(path)
      req.set_form_data(post) unless post.empty?

      req['Cookie'] = @session.cookies.to_header
      headers.each {|k,v| req[k] = v}

      res = Net::HTTP.new(uri.host, uri.port).start {|http| http.request(req)}

      if @verbose >= 2
        STDERR.puts "-----------------------------------------------"
        STDERR.puts res.class
        res.each_header {|k,v| STDERR.puts "#{k}: #{v}"}
      end

      # FIXME, what to do about more than one cookie
      @session.cookies.from_header(res['set-cookie']) if res.key?('set-cookie')

      # Follow redirects recursively.  Note the redirect is always re-fetched
      # with GET (empty post hash), matching common browser behavior.
      case res
      when Net::HTTPRedirection
        @session.base_uris[-1] = @session.absolute_uri(res['location'])
        res = fetch(res['location'], {}, headers)
      end

      # Cache the final (possibly post-redirect) body for GET requests.
      post.empty? and @cache.update(uri, res.body)
      res
    end

  end
  ##############################################################################
end
################################################################################
@@ -0,0 +1,110 @@
1
+ ################################################################################
2
+ #
3
+ # Copyright (C) 2006 Peter J Jones (pjones@pmade.com)
4
+ #
5
+ # Permission is hereby granted, free of charge, to any person obtaining
6
+ # a copy of this software and associated documentation files (the
7
+ # "Software"), to deal in the Software without restriction, including
8
+ # without limitation the rights to use, copy, modify, merge, publish,
9
+ # distribute, sublicense, and/or sell copies of the Software, and to
10
+ # permit persons to whom the Software is furnished to do so, subject to
11
+ # the following conditions:
12
+ #
13
+ # The above copyright notice and this permission notice shall be
14
+ # included in all copies or substantial portions of the Software.
15
+ #
16
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
23
+ #
24
+ ################################################################################
25
+ require 'cgi'
26
+ require 'hpricot'
27
+ ################################################################################
28
module Scrapes
  ##############################################################################
  module Hpricot # :nodoc:
    ############################################################################
    # Helpers for extracting text out of Hpricot parse trees.  Each public
    # extractor accepts a node (an Hpricot Elem/Doc/Text, an Elements
    # collection, or an Array of nodes) and HTML-unescapes the result.
    module Extractors
      ##########################################################################
      # Returns the text of any child text nodes recursively concatenated.
      def text(node)
        text_process(node,String) do |e| text(e) end
      end

      ##########################################################################
      # Returns the text of any child text nodes recursively as nested Array.
      def texts(node)
        text_process(node,Array) do |e| texts(e) end
      end

      ##########################################################################
      # Returns the text of any child text nodes concatenated.
      def content(node)
        text_process(node,String) do |e| e.content end
      end

      ##########################################################################
      # Returns the text of any child text nodes as an Array.
      def contents(node)
        text_process(node,Array) do |e| e.content end
      end

      ##########################################################################
      # The result of text() with whitespace reduced to single spaces and
      # stripped.
      def word(node)
        text_process(node,String) do |e| word(e).gsub(/\s+/,' ').strip end
      end

      ##########################################################################
      # The result of texts() flattened, whitespace reduced to single spaces,
      # stripped, and with all blank?s rejected.
      # NOTE(review): String#blank? is assumed to come from the rextra gem —
      # confirm it is loaded wherever this is called.
      def words(node)
        texts(node).flatten.compact.map{|e|e.gsub(/\s+/,' ').strip}.reject{|e| e.blank?}
      end

      ##########################################################################
      # Just return the yielded node unchanged.
      def xml(node)
        node
      end

      protected
      ##########################################################################
      # HTML-unescape the block's result: Strings are unescaped in place
      # (with &nbsp; converted to a plain space), Arrays recursively, nil
      # passes through, anything else raises.
      def unescape
        case result = yield
        when String then CGI::unescapeHTML(result).gsub('&nbsp;', ' ')
        when Array then result.map{|e| Extractors::unescape{e}}
        when NilClass then nil
        else raise "should be Array or String, was: #{result.class}"
        end
      end
      ##########################################################################
      # Walk a node (or collection of nodes), accumulating text into a new
      # instance of +klass+ (String or Array) via +block+, then unescape the
      # accumulated result.  Unknown node types yield nil.
      def text_process(node, klass, &block)
        Extractors::unescape do
          case node
          when Array, ::Hpricot::Elements
            node.map do |elem|
              text_process(elem,klass,&block)
            end
          when ::Hpricot::Elem, ::Hpricot::Doc
            node.children.inject(klass.new) do |value,child|
              # a child that yields nil (e.g. a comment) is silently skipped
              (value << block.call(child)) rescue nil
              value
            end
          when ::Hpricot::Text then node.content
          end
        end
      end

      # BUG FIX: :unescape added to the module_function list.  text_process
      # calls Extractors::unescape, which is a module-level call; without the
      # module_function copy, unescape existed only as a protected instance
      # method and every extractor raised NoMethodError.
      module_function :word, :words, :text, :texts, :content, :contents, :text_process, :unescape
    end
    ############################################################################
  end
  ##############################################################################
end
################################################################################
@@ -0,0 +1,86 @@
1
+ ################################################################################
2
+ #
3
+ # Copyright (C) 2006 Peter J Jones (pjones@pmade.com)
4
+ #
5
+ # Permission is hereby granted, free of charge, to any person obtaining
6
+ # a copy of this software and associated documentation files (the
7
+ # "Software"), to deal in the Software without restriction, including
8
+ # without limitation the rights to use, copy, modify, merge, publish,
9
+ # distribute, sublicense, and/or sell copies of the Software, and to
10
+ # permit persons to whom the Software is furnished to do so, subject to
11
+ # the following conditions:
12
+ #
13
+ # The above copyright notice and this permission notice shall be
14
+ # included in all copies or substantial portions of the Software.
15
+ #
16
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
23
+ #
24
+ ################################################################################
25
module Scrapes
  ##############################################################################
  # Initialize the Scrapes library: locates the 'pages' directory and loads
  # every Ruby file found there, retrying files whose class dependencies are
  # not yet defined.
  class Initializer
    ############################################################################
    # The directory name where the pages classes are kept
    attr_accessor :pages_dir

    ############################################################################
    # The parent directory where the pages_dir can be found
    attr_accessor :pages_parent

    ############################################################################
    # Create a new Initializer, yield it for configuration, and return it.
    def self.run (&block)
      instance = new
      block.call(instance) if block
      instance
    end

    ############################################################################
    # Establish all the defaults: a 'pages' directory next to the running
    # script ($0).
    def initialize
      @pages_dir = 'pages'
      @pages_parent = File.dirname($0)
    end

    ############################################################################
    # Run all the initialization steps.
    def process
      load_pages
    end

    ############################################################################
    private

    ############################################################################
    # Load every *.rb file found in the pages directory, in sorted order.
    def load_pages
      reloader(Dir.glob(File.join(@pages_parent, @pages_dir, '*.rb')).sort)
    end

    ############################################################################
    # Load each file, collecting those that raise NameError (usually a
    # not-yet-loaded dependency) and retrying them until +limit+ attempts
    # are exhausted, at which point the NameError propagates.
    def reloader (files, limit=4)
      failed = files.reject do |file|
        begin
          load File.expand_path(file)
          true
        rescue NameError
          raise if limit <= 0
          false
        end
      end

      reloader(failed, limit - 1) unless failed.empty?
    end

  end
end
################################################################################
@@ -0,0 +1,319 @@
1
+ ################################################################################
2
+ #
3
+ # Copyright (C) 2006 Peter J Jones (pjones@pmade.com)
4
+ #
5
+ # Permission is hereby granted, free of charge, to any person obtaining
6
+ # a copy of this software and associated documentation files (the
7
+ # "Software"), to deal in the Software without restriction, including
8
+ # without limitation the rights to use, copy, modify, merge, publish,
9
+ # distribute, sublicense, and/or sell copies of the Software, and to
10
+ # permit persons to whom the Software is furnished to do so, subject to
11
+ # the following conditions:
12
+ #
13
+ # The above copyright notice and this permission notice shall be
14
+ # included in all copies or substantial portions of the Software.
15
+ #
16
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
23
+ #
24
+ ################################################################################
25
+ require 'scrapes/rule_parser'
26
+ require 'hpricot'
27
+ require 'rextra'
28
+ ################################################################################
29
module Scrapes
  ##############################################################################
  # The page class is used as a base class for scraping data out of one web
  # page. To use it, you inherit from it and setup some rules. You can also
  # use validators to ensure that the page was scraped correctly.
  #
  # == Setup
  #
  #  class MyPageScraper < Scrapes::Page
  #    rule :rule_name, blah
  #  end
  # Scrapes::RuleParser explains the use of rules.
  #
  # == Auto Loading
  #
  # Scrapes will automatically 'require' ruby files placed in a special
  # 'pages' directory.  The idea is to place one Scrapes::Page derived class
  # per file in the pages directory, and have it required for you.
  #
  # == Validations
  #
  # There are a few class methods that you can use to validate the contents
  # you scraped from a given web page.
  #
  # NOTE(review): meta_eval (used throughout to keep per-subclass state in
  # class-level instance variables) is presumably provided by the rextra gem
  # required above — confirm.
  class Page
    include Scrapes::Hpricot::Extractors

    # External command used by with_xslt/process_page for XSLT preprocessing.
    XSLTPROC = 'xsltproc' # :nodoc:

    ############################################################################
    # RuleParser is used to extract data from web pages using CSS selectors
    # and raw element access by using procs.
    include RuleParser

    ############################################################################
    # Access the URI where this page's data came from
    attr_accessor :uri

    ############################################################################
    # Access the session object that was used to fetch this page's data
    attr_accessor :session

    ############################################################################
    # Access the Hpricot object that the selectors are passed
    attr_accessor :hpricot

    ############################################################################
    # If the page that you are parsing is paginated (one page in many of
    # similar data) you can use this class method to automatically fetch all
    # pages. In order for this to work, you need to provide a few special
    # methods:
    #
    # === Next Page
    #
    # If you know the URL to the next page, then provide an instance method
    # called <tt>next_page</tt>. It should return the URL for the next page,
    # or nil when the current page is the last page.
    #
    #  class NextPageExample < Scrapes::Page
    #    rule(:next_page, 'a[href~=next]', '@href', 1)
    #  end
    #
    # === Link for Page
    #
    # Alternatively, you can provide an instance method <tt>link_for_page</tt>
    # and another one called <tt>pages</tt>. The <tt>pages</tt> method should
    # return the number of pages in this paginated set. The
    # <tt>link_for_page</tt> method should take a page number, and return a
    # URL to fetch that page.
    #
    #  class LinkForPageExample < Scrapes::Page
    #    rule_1(:page) {|e| m = e.text.match(/Page\s+\d+\s+of\s+(\d+)/) and m[1].to_i}
    #
    #    def link_for_page (page)
    #      uri.sub(/page=\d+/, "page=#{page}")
    #    end
    #  end
    #
    # === Append to Page
    #
    # Finally, you must provide an <tt>append_page</tt> method. It takes an
    # instance of your Scrapes::Page derived class as an argument. Its job is
    # to add the data found on the current page to its instance variables.
    # This is because when you use paginated, it only returns one instance of
    # your class.
    def self.paginated
      meta_eval { @paginated = true }
    end

    ############################################################################
    # Make Page.extract return an array by calling the given method. This can
    # be very useful for when your class does nothing more than collect a set
    # of links for some other page to process. It causes Session#page to call
    # the given block once for each object returned from method_to_call.
    def self.acts_as_array (method_to_call)
      meta_eval { @as_array = method_to_call }
    end

    ############################################################################
    # Preprocess the HTML by sending it through an XSLT stylesheet. The
    # stylesheet should return a document that can be then processed using
    # your rules. Using this feature requires that you have the xsltproc
    # utility in your PATH.  You can get xsltproc from libxslt:
    # http://xmlsoft.org/XSLT/
    #
    # NOTE(review): the filename is later interpolated into a shell command in
    # process_page with only single quotes around it — a filename containing a
    # single quote would break the command. Only pass trusted paths.
    def self.with_xslt (filename)
      raise "#{XSLTPROC} could not be found" unless `#{XSLTPROC} --version 2>&1`.match(/libxslt/)
      meta_eval { @with_xslt = filename }
    end

    ############################################################################
    # Ensure that the given attributes have been set by matching rules
    def self.validates_presence_of (*attrs)
      attrs, options = attrs_options(attrs, {
        :message => 'rule never matched',
      })

      validates_from(attrs, options, lambda {|a| !a.nil?})
    end

    ############################################################################
    # Ensure that the given attributes are not #blank?
    # (String#blank? presumably comes from the rextra gem — confirm.)
    def self.validates_not_blank (*attrs)
      attrs, options = attrs_options(attrs, {
        :message => 'rule never matched',
      })

      validates_from(attrs, options, lambda {|a| !a.blank?})
    end

    ############################################################################
    # Ensure that the given attributes have the correct format, given as a
    # regular expression in the :with option (defaults to matching anything).
    def self.validates_format_of (*attrs)
      attrs, options = attrs_options(attrs, {
        :message => 'did not match regular expression',
        :with => /.*/,
      })

      validates_from(attrs, options, lambda {|a| a.to_s.match(options[:with])})
    end

    ############################################################################
    # Ensure that the given attributes have values in the given list
    # (the :in option).
    def self.validates_inclusion_of (*attrs)
      attrs, options = attrs_options(attrs, {
        :message => 'is not in the list of accepted values',
        :in => [],
      })

      validates_from(attrs, options, lambda {|a| options[:in].include?(a)})
    end

    ############################################################################
    # Ensure that the given attribute is a number (parsable by Kernel.Float).
    def self.validates_numericality_of (*attrs)
      attrs, options = attrs_options(attrs, {
        :message => 'is not a number',
      })

      closure = lambda do |a|
        begin
          Kernel.Float(a.to_s)
        rescue ArgumentError, TypeError
          false
        else
          true
        end
      end

      validates_from(attrs, options, closure)
    end

    ############################################################################
    # If using acts_as_array that returns links, send them to another class
    def self.to (other_class)
      ToProxy.new(self, other_class)
    end

    ############################################################################
    # Called by the crawler to process a web page.  Parses the page, walks
    # paginated sister pages when paginated was declared, optionally unwraps
    # the result via acts_as_array, and either returns the result or yields
    # it (element by element when it responds to #each) to the given block.
    def self.extract (data, uri, session, &block)
      obj = process_page(data, uri, session)

      if meta_eval {@paginated}
        if obj.respond_to?(:next_page)
          sister = obj

          # follow next_page links until one returns nil
          while sister_uri = sister.next_page
            sister = extract_sister(session, obj, sister_uri)
          end
        elsif obj.respond_to?(:link_for_page)
          # page 1 is obj itself; fetch pages 2..pages
          (2 .. obj.pages).each do |page|
            sister_uri = obj.link_for_page(page)
            extract_sister(session, obj, sister_uri)
          end
        end
      end

      as_array = meta_eval {@as_array}
      obj = obj.send(as_array) if as_array

      return obj unless block
      obj.respond_to?(:each) ? obj.each {|o| yield(o)} : yield(obj)
    end

    ############################################################################
    # Have a chance to do something after parsing, but before validation.
    # Default implementation does nothing; override in subclasses.
    def after_parse
    end

    ############################################################################
    # Called by the extract method to validate scraped data. If you override
    # this method, you should call super. This method will probably be changed
    # in the future so that you don't have to call super.  Raises a RuntimeError
    # naming the failing attribute and its validation message.
    def validate
      validations = self.class.meta_eval { @validations }

      validations.each do |v|
        raise "#{self.class}.#{v[:name]} #{v[:options][:message]}" unless
          v[:proc].call(send(v[:name]))
      end

      self
    end

    ############################################################################
    # NOTE(review): protected/private below do not actually restrict
    # def self. singleton methods in Ruby; they are kept as documentation of
    # intent only.
    protected

    ############################################################################
    # Called by extract to process a page object: optionally pipes the raw
    # HTML through xsltproc, parses it with Hpricot + the RuleParser-generated
    # parse, wires up uri/session, then runs after_parse and validate.
    def self.process_page (data, uri, session)
      if file = meta_eval { @with_xslt }
        options = "--html '#{file}' -"

        # write the HTML to xsltproc's stdin, read the transformed document
        open("|#{XSLTPROC} #{options} 2> /dev/null", 'w+') do |xsltproc|
          xsltproc << data
          xsltproc.close_write
          data = xsltproc.read
        end
      end

      obj = parse(Hpricot(data))
      obj.uri = uri
      obj.session = session
      obj.after_parse
      obj.validate
      obj
    end

    ############################################################################
    # Called by extract to process paginated objects: fetches one sister page,
    # parses it, and appends it to obj via the user-supplied append_page.
    def self.extract_sister (session, obj, sister_uri)
      res = session.crawler.fetch(sister_uri)
      sister = process_page(res.body, sister_uri, session)
      obj.append_page(sister)
      sister
    end

    ############################################################################
    private

    ############################################################################
    # Add some things to sub-classes: fresh per-subclass state so validations
    # and pagination flags are not shared with Page itself or siblings.
    def self.inherited (klass)
      klass.meta_eval do
        @validations = []
        @paginated = false
        @as_array = false
      end
    end

    ############################################################################
    # generic way to add validation: records name/options/closure triples that
    # #validate later replays against each attribute.
    def self.validates_from (attrs, options, closure)
      meta_eval do
        attrs.each do |a|
          @validations << {
            :name => a,
            :options => options,
            :proc => closure,
          }
        end
      end
    end

    ############################################################################
    # helper to correctly parse the validate calls: splits a trailing options
    # Hash off the attribute list and merges it over the defaults.
    def self.attrs_options (attrs, options)
      ops = attrs.pop if attrs.last.is_a?(Hash)
      options.update(ops) if ops
      [attrs, options]
    end

  end
  ##############################################################################
end
################################################################################