scrapes 0.2.0

@@ -0,0 +1,327 @@
+ ################################################################################
+ #
+ # Copyright (C) 2006 Peter J Jones (pjones@pmade.com)
+ #
+ # Permission is hereby granted, free of charge, to any person obtaining
+ # a copy of this software and associated documentation files (the
+ # "Software"), to deal in the Software without restriction, including
+ # without limitation the rights to use, copy, modify, merge, publish,
+ # distribute, sublicense, and/or sell copies of the Software, and to
+ # permit persons to whom the Software is furnished to do so, subject to
+ # the following conditions:
+ #
+ # The above copyright notice and this permission notice shall be
+ # included in all copies or substantial portions of the Software.
+ #
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ #
+ ################################################################################
+ #--
+ # This started as a branch of the uformatparser lib by:
+ # Author:: Assaf Arkin assaf@labnotes.org
+ # Documentation:: http://trac.labnotes.org/cgi-bin/trac.cgi/wiki/Ruby/MicroformatParser
+ # Copyright:: Copyright (c) 2005 Assaf Arkin
+ # License:: Creative Commons Attribution-ShareAlike
+ # Rewrite and Hpricot support by Michael Garriss
+ #++
+ ################################################################################
+ require 'yaml'
+ require 'scrapes/hpricot'
+ ################################################################################
+ module Scrapes
+   ################################################################################
+   # The methods defined here are available at the class scope level of a Scrapes::Page
+   # subclass. For example:
+   #   class Foobar < Scrapes::Page
+   #     rule :foo, 'foo'
+   #     rule_1 :bar, 'bar', 'text()'
+   #   end
+   #--
+   # === Using <tt>rule</tt>
+   # === Using <tt>rule_1</tt>
+   # === Using <tt>selector</tt>
+   # === Using <tt>extractor</tt>
+   #++
+   module RuleParser
+     ################################################################################
+     # name:: the name later used to invoke this rule
+     # select:: the selector to use, String or Symbol
+     # extract:: the extractor to use, String, Symbol, or Class. See RuleParser#extractor
+     # limit:: the limit of nodes to send to extractor
+     # block:: a block extractor, must not be defined if extract is non-nil
+     # Example:
+     #   class Foobar < Scrapes::Page
+     #     rule :foo, 'foo'
+     #   end
+     # Later it's used as an instance method on the Scrapes::Page objects like this:
+     #   foobar.foo.each do |foo|
+     #     example.attr << foo
+     #   end
+     def rule(name, select = '', extract = nil, limit = -1, &block)
+       raise InvalidRuleException, "First argument (rule name) is required" unless name
+       attr name, true
+       self.rules << Rule.new(name, selector(nil,select), extractor(nil,extract,&block), limit)
+     end
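+
+     ################################################################################
+     # For illustration only (the page class, selector, limit, and block below are
+     # made-up examples): a rule may also take a limit and a block extractor, which
+     # is handed each element matched by the selector.
+     #   class NewsPage < Scrapes::Page
+     #     rule :headlines, 'h2', nil, 5 do |elem|
+     #       elem.inner_text.strip   # assumes Hpricot's inner_text
+     #     end
+     #   end
+     #   news_page.headlines   # => Array of at most five headline strings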
+
+     ################################################################################
+     # Almost the same as +rule+, except the limit is forced to 1. The other
+     # difference is that RuleParser#rule returns a collection of matches (an Array,
+     # even if it only holds one match), whereas RuleParser#rule_1 returns just the match.
+     # name:: the name later used to invoke this rule
+     # select:: the selector to use, String or Symbol
+     # extract:: the extractor to use, String, Symbol, or Class
+     # block:: a block extractor, must not be defined if extract is non-nil
+     # Example:
+     #   class Foobar < Scrapes::Page
+     #     rule_1 :bar, 'tr'
+     #   end
+     # Later it's used as an instance method on the Scrapes::Page objects like this:
+     #   example.attr = foobar.bar
+     def rule_1(name, selector = '', extractor = nil, &block)
+       rule(name, selector, extractor, 1, &block)
+     end
+
+     ################################################################################
+     # Creates a standalone selector that can later be used in a rule. Example:
+     #   class Foobar < Scrapes::Page
+     #     selector :foo_select, 'table'
+     #     rule_1 :bar, :foo_select # a Symbol triggers use of the selector
+     #   end
+     # name:: the name later used to invoke this selector
+     # select:: the selector to use, String or NilClass
+     # block:: a block selector, must not be defined if select is non-nil
+     # A block selector is yielded the Hpricot doc object just once. The collection it
+     # returns is iterated over and each match is passed to the extractor. Example:
+     #   class Foobar < Scrapes::Page
+     #     selector :foo_select_2 do |hpricot_doc|
+     #       hpricot_doc.search('table')
+     #     end
+     #     rule_1 :bar, :foo_select_2 # a Symbol triggers use of the selector
+     #   end
+     # String selectors passed to <tt>rule</tt> or <tt>rule_1</tt> are interpreted as Hpricot
+     # search strings. See http://code.whytheluckystiff.net/hpricot/wiki/AnHpricotShowcase
+     def selector(name, select = nil, &block)
+       tor '@selector', name, select, &block
+     end
+
+     ################################################################################
+     # Creates a standalone extractor that can later be used in a rule. Example:
+     #   class Foobar < Scrapes::Page
+     #     extractor :mailto_extract do |elem|
+     #       elem.attributes['href'].sub(/mailto:/,'') # remove the mailto: string
+     #     end
+     #     rule :emails, 'a[@href^="mailto:"]', :mailto_extract
+     #   end
+     # name:: the name later used to invoke this extractor
+     # extract:: the extractor to use, String or NilClass
+     # block:: a block extractor, must not be defined if extract is non-nil
+     # A block extractor is yielded each object that matched the rule's selector.
+     #
+     # Extractors passed to <tt>rule</tt> or <tt>rule_1</tt> are interpreted based on
+     # the class of the extractor as follows:
+     # ==== NilClass
+     # The result of the selector is returned as-is. Thus <tt>foo.my_rule</tt> would
+     # just return the selector results defined on the :my_rule rule.
+     # ==== Symbol
+     # A custom extractor is used. See the example above for this method.
+     # ==== Class
+     # A nested class of the given name is used as a new inner-parser. An instance of that
+     # class is returned from each invocation of the extractor. Example:
+     #   class Outer < Scrapes::Page
+     #     class Inner < Scrapes::Page
+     #       rule_1 :bold_text, 'b', 'text()'
+     #       rule_1 :img_src, 'img[@src]', '@src'
+     #     end
+     #     rule :items, 'tr', Inner
+     #   end
+     # Now calling <tt>my_page.items</tt> returns an Array of Inner objects, each of
+     # which separately parses out the bold text and image source of one table row in
+     # the document.
+     # ==== String
+     # Two patterns:
+     # @foobar:: extract the contents of an attribute named 'foobar'
+     # foobar():: invoke the foobar builtin extractor, see Scrapes::Hpricot::Extractors
+     def extractor(name, extract = nil, &block)
+       tor '@extractor', name, extract, &block
+     end
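+
+     ################################################################################
+     # A minimal sketch of the two String patterns above (the page class, selectors,
+     # and attribute are made-up examples):
+     #   class GalleryPage < Scrapes::Page
+     #     rule   :image_urls, 'img',   '@src'    # '@src' pulls the src attribute
+     #     rule_1 :title,      'title', 'text()'  # 'text()' calls the builtin extractor
+     #   end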
+
+     ################################################################################
+     def parse(node, context = nil, rules = nil) # :nodoc:
+       context = self.new() unless context
+       rules = self.rules unless rules
+       if rules
+         rules.each_with_index do |rule, index|
+           if rule and rule.process(node, context)
+             less_rules = rules.clone unless less_rules
+             less_rules[index] = nil
+           end
+         end
+       end
+       context
+     end
+
+     ################################################################################
+     def rules() # :nodoc:
+       @microparser_rules ||= []
+     end
+
+     private
+
+     ################################################################################
+     def tor(type, name, tor_arg = nil, &block)
+       raise InvalidRuleException, "can't use both arg and block" if tor_arg and block
+       result = case (tor_arg ||= block)
+         when NilClass then proc {|node| node}
+         when String
+           if type == '@selector'
+             proc {|node| node.search(tor_arg)}
+           else
+             Extractor.new self, tor_arg
+           end
+         when Proc, Method then tor_arg
+         when Symbol then proc {|node| send(tor_arg,node) }
+         when Class
+           begin
+             tor_arg.method(:parse)
+           rescue NameError=>error
+             raise InvalidRuleException,
+               "Selector class must implement the method parse", error.backtrace
+           end
+           tor_arg
+         else
+           raise InvalidRuleException,
+             "Invalid tor type: must be a string, parser class, block or nil"
+       end
+       # TODO dry
+       if type == "@selector"
+         self.class.class_eval { (@selector ||= {})[name] = result }
+         class_def(name) do |node|
+           self.class.class_eval { @selector[name].call(node) }
+         end if name
+       else
+         self.class.class_eval { (@extractor ||= {})[name] = result }
+         class_def(name) do |node|
+           self.class.class_eval { @extractor[name].call(node) }
+         end if name
+       end
+       result
+     end
+
+     ################################################################################
+     def self.included(mod) # :nodoc:
+       mod.extend(self)
+       mod.extend(Scrapes::Hpricot::Extractors)
+     end
+
+     ################################################################################
+     class Rule #:nodoc:all
+       attr :name
+       attr :limit,true
+       attr :selector
+       attr :extractor
+
+       ################################################################################
+       def initialize(name, selector, extractor, limit)
+         @name, @selector, @extractor, @limit = name.to_s.intern, selector, extractor, limit
+       end
+
+       ################################################################################
+       def process(node, context)
+         context.instance_variable_set '@hpricot', node
+         return true if @limit == 0
+         result = @selector.call(node)
+         result = [result] unless result.respond_to? :each
+         current = context.instance_variable_set "@#@name", []
+         result.compact.each do |node|
+           value = case @extractor
+             when UnboundMethod then @extractor.bind(context).call(node)
+             when Extractor then @extractor.extract(node)
+             when Proc, Method then @extractor.call(node)
+             when Class then @extractor.parse(node)
+           end
+           next unless value
+           current << value
+           break if current.size == @limit
+         end
+         context.instance_variable_set "@#@name", current[0] if @limit == 1
+         true
+       end
+
+       ################################################################################
+       def inspect
+         @selector ? "[to #{@name} from #{@selector.inspect}, #{@extractor.inspect}, limit #{@limit}]" : "[to #{@name} from #{@extractor.inspect}, limit #{@limit}]"
+       end
+     end
+
+     ################################################################################
+     class Extractor # :nodoc:all
+       # TODO review this
+       # Parse each extractor into three parts:
+       #   $1 function name (excluding parentheses)
+       #   $2 element name
+       #   $3 attribute name (including leading @)
+       # If a match is found the result is either $1, or $2 and/or $3
+       REGEX = /^(\w+)\(\)|([A-Za-z][A-Za-z0-9_\-:]*)?(@[A-Za-z][A-Za-z0-9_\-:]*)?$/
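+
+       # For example (informal, derived from the expression above):
+       #   'text()'  matches with $1 = "text"
+       #   'img@src' matches with $2 = "img" and $3 = "@src"
+       #   '@href'   matches with $3 = "@href"
+       # Statements may also be chained with '|'; the first extract returning a
+       # non-nil value wins (see +extract+ below).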
+
+       ################################################################################
+       def initialize(context, statement) # :nodoc:
+         statement.strip!
+         @extracts = []
+         statement.split('|').each do |extract|
+           parts = REGEX.match(extract)
+           if parts[1]
+             begin
+               @extracts << context.method(parts[1])
+             rescue NameError=>error
+               raise InvalidRuleException, error.message, error.backtrace
+             end
+           elsif parts[2] and parts[3]
+             attr_name = parts[3][1..-1]
+             @extracts << proc do |node|
+               node.attributes[attr_name] if node.name == parts[2]
+             end
+           elsif parts[2]
+             @extracts << proc { |node| text(node) if node.name == parts[2] }
+           elsif parts[3]
+             attr_name = parts[3][1..-1]
+             @extracts << proc do |node|
+               if node.respond_to? :each
+                 node.all.attributes.all[attr_name]
+               else
+                 node.attributes[attr_name]
+               end
+             end
+           else
+             raise InvalidRuleException, "Invalid extraction statement"
+           end
+         end
+         raise InvalidRuleException, "Invalid (empty) extraction statement" if
+           @extracts.size == 0
+       end
+
+       ################################################################################
+       def extract(node) # :nodoc:
+         value = nil
+         @extracts.find do |extract|
+           value = extract.call(node)
+         end
+         value
+       end
+
+       ################################################################################
+       def inspect() # :nodoc:
+         @extracts.join('|')
+       end
+     end
+
+     ################################################################################
+     class InvalidRuleException < Exception # :nodoc:all
+     end
+   end
+ end
@@ -0,0 +1,155 @@
+ ################################################################################
+ #
+ # Copyright (C) 2006 Peter J Jones (pjones@pmade.com)
+ #
+ # Permission is hereby granted, free of charge, to any person obtaining
+ # a copy of this software and associated documentation files (the
+ # "Software"), to deal in the Software without restriction, including
+ # without limitation the rights to use, copy, modify, merge, publish,
+ # distribute, sublicense, and/or sell copies of the Software, and to
+ # permit persons to whom the Software is furnished to do so, subject to
+ # the following conditions:
+ #
+ # The above copyright notice and this permission notice shall be
+ # included in all copies or substantial portions of the Software.
+ #
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ #
+ ################################################################################
+ require 'scrapes/crawler'
+
+ module Scrapes
+   ################################################################################
+   # Session is used to process all web pages under a single session. This may
+   # be necessary when some web sites need you to log in, or otherwise create
+   # a session ID with a cookie before you can continue processing pages.
+   class Session
+     ################################################################################
+     attr_reader :log
+
+     ################################################################################
+     attr_accessor :post
+
+     ################################################################################
+     attr_accessor :timeout
+
+     ################################################################################
+     attr_accessor :cookies
+
+     ################################################################################
+     attr_reader :uri
+
+     ################################################################################
+     attr_reader :crawler
+
+     ################################################################################
+     attr_reader :base_uris
+
+     ################################################################################
+     # Start a session using an HTTP GET
+     def self.from_get (uri, &block)
+       session = self.new
+       session.uri = uri
+       block ? yield(session) : session
+     end
+
+     ################################################################################
+     # Start a session using an HTTP POST
+     def self.from_post (uri, post, &block)
+       session = self.new
+       session.uri = uri
+       session.post = post
+       block ? yield(session) : session
+     end
+
+     ################################################################################
+     # Start a session without having to create a session with the web site first.
+     def self.start (log=nil,&block)
+       session = self.new(log)
+       block ? yield(session) : session
+     end
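+
+     ################################################################################
+     # A usage sketch (the URL and +AccountPage+ class are hypothetical):
+     #   Scrapes::Session.from_get('http://www.example.com/') do |session|
+     #     session.page(AccountPage, '/account')
+     #   end
+     # +from_post+ works the same way but also takes a Hash of form data, while
+     # +start+ opens a session without recording an initial URI.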
+
+     ################################################################################
+     def initialize log = nil
+       @uri = nil
+       @post = {}
+       @when = Time.at(0)
+       @timeout = 900
+       @cookies = Cookies.new
+       @base_uris = []
+       @crawler = Crawler.new(self)
+       @crawler.log = @log = log
+       @refreshing = false
+     end
+
+     ################################################################################
+     def uri= (uri)
+       @uri = uri
+       @base_uris << uri
+     end
+
+     ################################################################################
+     # Process a web page (or a list of links) with the given Scrapes::Page subclass
+     def page (page_class, link, post={}, &block)
+       return if link.nil?
+       link = [link] unless link.respond_to?(:to_ary)
+       block ||= lambda {|data| data}
+       result = nil
+
+       link.each do |u|
+         fetch(u, post) do |res|
+           result = page_class.extract(res.body, u, self, &block)
+         end
+       end
+
+       result
+     end
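+
+     ################################################################################
+     # Illustrative only: +SearchPage+ and the URL are assumptions, and the block
+     # receives whatever the page class's +extract+ yields (typically the parsed page):
+     #   names = session.page(SearchPage, 'http://www.example.com/search?q=ruby') do |p|
+     #     p.result_names
+     #   end
+     # When +link+ is an Array, each URL is fetched in turn and the value from the
+     # last one is returned.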
+
+     ################################################################################
+     # Fetch a URL in the session, but without a Scrapes::Page
+     def fetch (uri, post={}, &block)
+       u = absolute_uri(uri)
+       @base_uris.push(u)
+       yield(@crawler.fetch(u, post))
+       @base_uris.pop
+     end
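+
+     ################################################################################
+     # For example (the path is hypothetical; the yielded object is the crawler's
+     # response, whose +body+ holds the raw content, as used by +page+ above):
+     #   session.fetch('/robots.txt') do |response|
+     #     puts response.body
+     #   end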
+
+     ################################################################################
+     # Refresh the session. This is sometimes necessary when pages have been coming
+     # out of the cache, but a request then goes to the real web site and the
+     # session there has expired.
+     def refresh
+       if !@refreshing and @uri and (Time.now - @when) > @timeout
+         begin
+           @refreshing = true
+           @when = Time.now
+           @cookies.clear
+
+           @crawler.cache.without_cache do
+             @crawler.fetch(uri, post)
+           end
+         ensure
+           @refreshing = false
+         end
+       end
+
+       self
+     end
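+
+     ################################################################################
+     # For example (the timeout value is illustrative):
+     #   session.timeout = 300   # treat the session as stale after five minutes
+     #   session.refresh         # re-fetches the original URI only when stale; returns self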
+
+     ################################################################################
+     # Convert a relative URI to an absolute URI
+     def absolute_uri (uri)
+       return uri if @base_uris.empty?
+       base = URI.parse(@base_uris.last)
+       base.merge(uri).to_s
+     end
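+
+     ################################################################################
+     # For example (URLs are illustrative):
+     #   session.uri = 'http://www.example.com/a/b.html'
+     #   session.absolute_uri('c.html')   # => "http://www.example.com/a/c.html"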
+
+   end
+   ################################################################################
+ end
+ ################################################################################