scrapes 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,327 @@
1
+ ################################################################################
2
+ #
3
+ # Copyright (C) 2006 Peter J Jones (pjones@pmade.com)
4
+ #
5
+ # Permission is hereby granted, free of charge, to any person obtaining
6
+ # a copy of this software and associated documentation files (the
7
+ # "Software"), to deal in the Software without restriction, including
8
+ # without limitation the rights to use, copy, modify, merge, publish,
9
+ # distribute, sublicense, and/or sell copies of the Software, and to
10
+ # permit persons to whom the Software is furnished to do so, subject to
11
+ # the following conditions:
12
+ #
13
+ # The above copyright notice and this permission notice shall be
14
+ # included in all copies or substantial portions of the Software.
15
+ #
16
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
23
+ #
24
+ ################################################################################
25
+ #--
26
+ # This started as a branch of the uformatparser lib by:
27
+ # Author:: Assaf Arkin assaf@labnotes.org
28
+ # Documentation:: http://trac.labnotes.org/cgi-bin/trac.cgi/wiki/Ruby/MicroformatParser
29
+ # Copyright:: Copyright (c) 2005 Assaf Arkin
30
+ # License:: Creative Commons Attribution-ShareAlike
31
+ # Rewrite and Hpricot support by Michael Garriss
32
+ #++
33
+ ################################################################################
34
+ require 'yaml'
35
+ require 'scrapes/hpricot'
36
+ ################################################################################
37
+ module Scrapes
38
+ ################################################################################
39
+ # The methods defined here are available at the class scope level of a Scrapes::Page
40
+ # subclass. For example:
41
+ # class Foobar < Scrapes::Page
42
+ # rule :foo, 'foo'
43
+ # rule_1 :bar, 'bar', 'text()'
44
+ # end
45
+ #--
46
+ # === Using <tt>rule</tt>
47
+ # === Using <tt>rule_1</tt>
48
+ # === Using <tt>selector</tt>
49
+ # === Using <tt>extractor</tt>
50
+ #++
51
+ module RuleParser
52
+ ################################################################################
53
+ # name:: the name later used to invoke this rule
54
+ # select:: the selector to use, String or Symbol
55
+ # extract:: the extractor to use, String, Symbol, or Class. See RuleParser#extractor
56
+ # limit:: the limit of nodes to send to extractor
57
+ # block:: a block extractor, must not be defined if extract is non-nil
58
+ # Example:
59
+ # class Foobar < Scrapes::Page
60
+ # rule :foo, 'foo'
61
+ # end
62
+ # Later it's used as an instance method on the Scrapes::Page objects like this:
63
+ # foobar.foo.each do |foo|
64
+ # example.attr << foo
65
+ # end
66
# Declares a named scraping rule on the receiving class.
#
# Defines a read/write accessor called +name+ and appends a Rule, built from
# the resolved selector and extractor, to the rule list.
#
# name::    rule name, also used as the accessor name (required)
# select::  selector spec, handed to #selector
# extract:: extractor spec, handed to #extractor
# limit::   value cap handed to Rule.new unchanged (default -1)
# block::   optional block extractor, handed to #extractor
#
# Returns the rule list. Raises InvalidRuleException when +name+ is nil/false.
def rule(name, select = '', extract = nil, limit = -1, &block)
  raise InvalidRuleException, "First argument (rule name) is required" unless name

  # `attr name, true` is the obsolete spelling of attr_accessor and emits a
  # deprecation warning on current Rubies; the generated methods are the same.
  attr_accessor name
  rules << Rule.new(name, selector(nil, select), extractor(nil, extract, &block), limit)
end
71
+
72
+ ################################################################################
73
+ # Almost the same as rule except forces limit to be 1. The other difference is
74
+ # that RuleParser#rule returns collections of matches (an Array, even if of size 1) whereas
75
+ # RuleParser#rule_1 just returns the match.
76
+ # name:: the name later used to invoke this rule
77
+ # select:: the selector to use, String or Symbol
78
+ # extract:: the extractor to use, String, Symbol, or Class
79
+ # block:: a block extractor, must not be defined if extract is non-nil
80
+ # Example:
81
+ # class Foobar < Scrapes::Page
82
+ # rule_1 :bar, 'tr'
83
+ # end
84
+ # Later it's used as an instance method on the Scrapes::Page objects like this:
85
+ # example.attr = foobar.bar
86
# Single-match flavour of #rule: every argument is passed straight through,
# with the limit fixed at one.
def rule_1(name, selector = '', extractor = nil, &block)
  single_match = 1
  rule(name, selector, extractor, single_match, &block)
end
89
+
90
+ ################################################################################
91
+ # Creates a standalone selector that can later be used in a rule. Example:
92
+ # class Foobar < Scrapes::Page
93
+ # selector :foo_select, 'table'
94
+ # rule_1 :bar, :foo_select # a Symbol triggers use of the selector
95
+ # end
96
+ # name:: the name later used to invoke this selector
97
+ # select:: the selector to use, String or NilClass
98
+ # block:: a block selector, must not be defined if select is non-nil
99
+ # A block selector is yielded the Hpricot doc object just once. The collection it
100
+ # returns is iterated over and each match is passed to the extractor. Example:
101
+ # class Foobar < Scrapes::Page
102
+ # selector :foo_select_2 do |hpricot_doc|
103
+ # hpricot_doc.search('table')
104
+ # end
105
+ # rule_1 :bar, :foo_select_2 # a Symbol triggers use of the selector
106
+ # end
107
+ # String selectors passed to <tt>rule</tt> or <tt>rule_1</tt> are interpreted as Hpricot
108
+ # search strings. See http://code.whytheluckystiff.net/hpricot/wiki/AnHpricotShowcase
109
# Builds a selector from +select+ or +block+ by delegating to the shared
# +tor+ helper with the '@selector' type tag; returns whatever +tor+ returns.
def selector(name, select = nil, &block)
  tor('@selector', name, select, &block)
end
112
+
113
+ ################################################################################
114
+ # Creates a standalone extractor that can later be used in a rule. Example:
115
+ # class Foobar < Scrapes::Page
116
+ # extractor :mailto_extract do |elem|
117
+ # elem.attributes['href'].sub(/mailto:/,'') # remove the mailto: string
118
+ # end
119
+ # rule :emails, 'a[@href^="mailto:"]', :mailto_extract
120
+ # end
121
+ # name:: the name later used to invoke this extractor
122
+ # extract:: the extractor to use, String or NilClass
123
+ # block:: a block extractor, must not be defined if extract is non-nil
124
+ # A block extractor is yielded each object that matched the rule's selector.
125
+ #
126
+ # Extractors passed to <tt>rule</tt> or <tt>rule_1</tt> are interpreted based on
127
+ # the class of the extractor as follows
128
+ # ==== NilClass
129
+ # The result of the selector is just re-returned. Thus <tt>foo.my_rule</tt> would
130
+ # just return the selector results defined on the :my_rule rule.
131
+ # ==== Symbol
132
+ # A custom extractor is used. See the example at the top of this method's documentation.
133
+ # ==== Class
134
+ # A nested class of the given name is used as a new inner-parser. An instance of that
135
+ # class is returned from each invocation of the extractor. Example:
136
+ # class Outer < Scrapes::Page
137
+ # class Inner < Scrapes::Page
138
+ # rule_1 :bold_text, 'b', 'text()'
139
+ # rule_1 :img_src, 'img[@src]', '@src'
140
+ # end
141
+ # rule :items, 'tr', Inner
142
+ # end
143
+ # Now calling <tt>my_page.items</tt> returns an Array of Inner objects that each
144
+ # separately parses out the bold text and image source of each table row in the
145
+ # document.
146
+ # ==== String
147
+ # Two patterns:
148
+ # @foobar:: extract the contents of an attribute named 'foobar'
149
+ # foobar():: invoke the foobar builtin extractor, see Scrapes::Hpricot::Extractors
150
# Builds an extractor from +extract+ or +block+ by delegating to the shared
# +tor+ helper with the '@extractor' type tag; returns whatever +tor+ returns.
def extractor(name, extract = nil, &block)
  tor('@extractor', name, extract, &block)
end
153
+
154
+ ################################################################################
155
# Runs every rule against +node+, storing results on +context+.
#
# node::    the document/element handed to each rule's +process+
# context:: object that receives the results; a fresh +self.new+ when omitted
# rules::   rules to apply; defaults to +self.rules+. nil entries are skipped.
#
# Returns +context+.
#
# The earlier version also built a `less_rules` copy of the rule list inside
# the loop. That variable was local to the block (so it was re-cloned on every
# iteration) and was never read, so it has been removed.
def parse(node, context = nil, rules = nil) # :nodoc:
  context ||= self.new
  rules ||= self.rules
  if rules
    rules.each do |rule|
      rule.process(node, context) if rule
    end
  end
  context
end
168
+
169
+ ################################################################################
170
# Lazily created list of rules, kept in @microparser_rules on the receiver.
def rules # :nodoc:
  @microparser_rules = [] unless @microparser_rules
  @microparser_rules
end
173
+
174
+ private
175
+
176
+ ################################################################################
177
# Shared builder behind #selector and #extractor ("selec-tor"/"extrac-tor").
#
# type::    '@selector' or '@extractor'; picks the registry and decides how a
#           String argument is interpreted
# name::    name to register the result under; when non-nil a method of that
#           name is also defined through +class_def+
# tor_arg:: nil, String, Symbol, Proc/Method or a Class responding to +parse+
# block::   alternative to +tor_arg+; giving both is an error
#
# Returns the callable (or Extractor / parser class) that was built.
# Raises InvalidRuleException for conflicting or unsupported arguments.
#
# NOTE(review): the registry hash lives on +self.class+ at registration time
# but is looked up on +self.class+ of whatever object runs the generated
# method. Whether these are the same object depends on how +class_def+ is
# defined elsewhere - behaviour is kept exactly as before; please confirm.
def tor(type, name, tor_arg = nil, &block)
  raise InvalidRuleException, "can't use both arg and block" if tor_arg and block

  tor_arg ||= block
  result = case tor_arg
           when NilClass then proc { |node| node }
           when String
             if type == '@selector'
               proc { |node| node.search(tor_arg) }
             else
               Extractor.new self, tor_arg
             end
           when Proc, Method then tor_arg
           when Symbol then proc { |node| send(tor_arg, node) }
           when Class
             begin
               # only probing for the method; the Method object is discarded
               tor_arg.method(:parse)
             rescue NameError => error
               raise InvalidRuleException,
                     "Selector class must implement the method parse", error.backtrace
             end
             tor_arg
           else
             raise InvalidRuleException,
                   "Invalid tor type: must be a string, parser class, block or nil"
           end

  # One code path for both registries (this was two copy-pasted branches).
  registry = type == "@selector" ? '@selector' : '@extractor'
  owner = self.class
  table = owner.instance_variable_get(registry) || owner.instance_variable_set(registry, {})
  table[name] = result
  if name
    class_def(name) do |node|
      self.class.instance_variable_get(registry)[name].call(node)
    end
  end
  result
end
215
+
216
+ ################################################################################
217
# Include hook: turns the DSL into class-level methods of the including
# module/class by extending it with this module, then with the
# Scrapes::Hpricot::Extractors module (defined outside this file).
def self.included(mod) # :nodoc:
  mod.extend(self)
  builtin_extractors = Scrapes::Hpricot::Extractors
  mod.extend(builtin_extractors)
end
221
+
222
+ ################################################################################
223
# One compiled rule: a selector that finds candidates in a node, an extractor
# that turns each candidate into a value, and a cap on how many values to keep.
class Rule #:nodoc:all
  attr_reader :name, :selector, :extractor

  # `attr :limit, true` was the obsolete boolean form of attr_accessor.
  attr_accessor :limit

  # name::      rule name; stored as a Symbol
  # selector::  callable taking a node and returning the candidates
  # extractor:: UnboundMethod, Extractor, Proc/Method, or a Class with +parse+
  # limit::     0 disables the rule, 1 stores a single value, any other number
  #             stops collecting once that many values were gathered
  def initialize(name, selector, extractor, limit)
    @name = name.to_s.to_sym
    @selector = selector
    @extractor = extractor
    @limit = limit
  end

  # Applies the rule to +node+ and stores the outcome on +context+.
  #
  # +context+ always gets @hpricot set to +node+. Unless the limit is 0, the
  # selector result (wrapped in an Array when it does not respond to +each+)
  # is walked with nil entries dropped, and every truthy extracted value is
  # collected into the instance variable named after the rule. With a limit
  # of 1 that variable ends up holding the first value (or nil) instead of an
  # Array.
  #
  # Always returns true.
  def process(node, context)
    context.instance_variable_set '@hpricot', node
    return true if @limit == 0

    matches = @selector.call(node)
    matches = [matches] unless matches.respond_to? :each
    ivar = "@#{@name}"
    collected = context.instance_variable_set ivar, []
    # The block parameter used to be called `node` too, shadowing the method
    # parameter (and overwriting it on Ruby 1.8); it has its own name now.
    matches.compact.each do |match|
      value = extract_value(match, context)
      next unless value
      collected << value
      break if collected.size == @limit
    end
    context.instance_variable_set ivar, collected[0] if @limit == 1
    true
  end

  def inspect
    if @selector
      "[to #{@name} from #{@selector.inspect}, #{@extractor.inspect}, limit #{@limit}]"
    else
      "[to #{@name} from #{@extractor.inspect}, limit #{@limit}]"
    end
  end

  private

  # Runs the extractor on one candidate, dispatching on the extractor's kind.
  # Yields nil when the extractor is none of the supported kinds.
  def extract_value(match, context)
    case @extractor
    when UnboundMethod then @extractor.bind(context).call(match)
    when Extractor then @extractor.extract(match)
    when Proc, Method then @extractor.call(match)
    when Class then @extractor.parse(match)
    end
  end
end
261
+
262
+ ################################################################################
263
# Compiles a String extraction statement (alternatives separated by '|') into
# a list of callables and applies them to nodes.
class Extractor # :nodoc:all
  # TODO review this
  # Parse each extractor into three parts:
  #   $1 function name (excluding parentheses)
  #   $2 element name
  #   $3 attribute name (including leading @)
  # If a match is found the result is either $1, or $2 and/or $3
  #
  # NOTE(review): the alternation is not grouped, so `^` only anchors the
  # function form and `$` only anchors the element/attribute form. Text after
  # "name()" or before an element/attribute is therefore silently ignored.
  # Left as is because tightening it could reject statements that work today.
  REGEX = /^(\w+)\(\)|([A-Za-z][A-Za-z0-9_\-:]*)?(@[A-Za-z][A-Za-z0-9_\-:]*)?$/

  # context::   object whose methods back the "name()" form and the
  #             element-only form
  # statement:: extraction statement, e.g. "text()", "@href" or "a@href|b"
  #
  # Raises InvalidRuleException for an empty or unparsable statement, or when
  # a "name()" alternative names a method +context+ does not have.
  def initialize(context, statement) # :nodoc:
    # Work on a stripped copy: strip! altered the caller's string and raised
    # on frozen literals.
    @sources = statement.strip.split('|')
    @extracts = @sources.map { |source| compile(context, source) }
    raise InvalidRuleException, "Invalid (empty) extraction statement" if
      @extracts.size == 0
  end

  # Tries each alternative in order and returns the first truthy result, or
  # the (falsy) result of the last alternative when none produced a value.
  def extract(node) # :nodoc:
    value = nil
    @extracts.each do |candidate|
      value = candidate.call(node)
      break if value
    end
    value
  end

  # The statement alternatives as written, re-joined with '|'. (Joining the
  # compiled procs only produced object addresses.)
  def inspect() # :nodoc:
    @sources.join('|')
  end

  private

  # Turns one alternative of the statement into a callable taking a node.
  def compile(context, source)
    parts = REGEX.match(source)
    raise InvalidRuleException, "Invalid extraction statement" unless parts

    function, element, attribute = parts[1], parts[2], parts[3]
    if function
      begin
        context.method(function)
      rescue NameError => error
        raise InvalidRuleException, error.message, error.backtrace
      end
    elsif element and attribute
      attr_name = attribute[1..-1]
      proc { |node| node.attributes[attr_name] if node.name == element }
    elsif element
      # This used to call text(node) on the Extractor itself, which defines
      # no such method. Assumes the context provides `text` (the "text()"
      # form resolves it there as well) - TODO confirm.
      proc { |node| context.send(:text, node) if node.name == element }
    elsif attribute
      attr_name = attribute[1..-1]
      proc do |node|
        if node.respond_to? :each
          node.all.attributes.all[attr_name]
        else
          node.attributes[attr_name]
        end
      end
    else
      raise InvalidRuleException, "Invalid extraction statement"
    end
  end
end
322
+
323
+ ################################################################################
324
# Raised when a rule, selector or extractor declaration is malformed.
#
# Derives from StandardError rather than directly from Exception, so a plain
# `rescue` sees it. It is still an Exception, so existing
# `rescue InvalidRuleException` and `rescue Exception` handlers are unaffected.
class InvalidRuleException < StandardError # :nodoc:all
end
326
+ end
327
+ end
@@ -0,0 +1,155 @@
1
+ ################################################################################
2
+ #
3
+ # Copyright (C) 2006 Peter J Jones (pjones@pmade.com)
4
+ #
5
+ # Permission is hereby granted, free of charge, to any person obtaining
6
+ # a copy of this software and associated documentation files (the
7
+ # "Software"), to deal in the Software without restriction, including
8
+ # without limitation the rights to use, copy, modify, merge, publish,
9
+ # distribute, sublicense, and/or sell copies of the Software, and to
10
+ # permit persons to whom the Software is furnished to do so, subject to
11
+ # the following conditions:
12
+ #
13
+ # The above copyright notice and this permission notice shall be
14
+ # included in all copies or substantial portions of the Software.
15
+ #
16
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
23
+ #
24
+ ################################################################################
25
+ require 'scrapes/crawler'
26
+
27
+ module Scrapes
28
+ ################################################################################
29
+ # Session is used to process all web pages under a single session. This may
30
+ # be necessary when some web sites need you to login, or otherwise create
31
+ # a session ID with a cookie before you can continue processing pages.
32
# Processes web pages under a single session. A session is optionally
# established by fetching +uri+ (with +post+ data), and keeps a stack of base
# URIs so that relative links can be resolved while pages are being fetched.
class Session
  # Log object given to the constructor; also handed to the crawler.
  attr_reader :log

  # POST data sent along with +uri+ when the session is refreshed.
  attr_accessor :post

  # Seconds that must have passed since the last refresh before #refresh
  # contacts the site again (900 by default).
  attr_accessor :timeout

  # Cookie store; emptied at the start of every refresh.
  attr_accessor :cookies

  # URI used to establish (and refresh) the session, or nil.
  attr_reader :uri

  # The Crawler performing the actual fetches.
  attr_reader :crawler

  # Stack of base URIs; the last entry resolves relative links.
  attr_reader :base_uris

  # Start a session using a HTTP GET.
  # Yields the session and returns the block's value when a block is given,
  # otherwise returns the session.
  def self.from_get(uri, &block)
    session = new
    session.uri = uri
    block ? yield(session) : session
  end

  # Start a session using HTTP POST.
  # Yields the session and returns the block's value when a block is given,
  # otherwise returns the session.
  def self.from_post(uri, post, &block)
    session = new
    session.uri = uri
    session.post = post
    block ? yield(session) : session
  end

  # Start a session without having to create a session with the web site first.
  # Yields the session and returns the block's value when a block is given,
  # otherwise returns the session.
  def self.start(log = nil, &block)
    session = new(log)
    block ? yield(session) : session
  end

  def initialize(log = nil)
    @uri = nil
    @post = {}
    @when = Time.at(0) # epoch, so the first #refresh is always due
    @timeout = 900
    @cookies = Cookies.new
    @base_uris = []
    @crawler = Crawler.new(self)
    @crawler.log = @log = log
    @refreshing = false
  end

  # Sets the session URI and pushes it as a base URI for relative links.
  def uri=(uri)
    @uri = uri
    @base_uris << uri
  end

  # Process a web page.
  #
  # page_class:: object whose +extract+ is called with the response body, the
  #              link, this session and the block
  # link::       one link or a list of links (anything responding to to_ary)
  # post::       POST data handed to #fetch
  # block::      handed on to +extract+; defaults to an identity lambda
  #
  # Returns the result of the last +extract+ call, or nil when +link+ is nil.
  def page(page_class, link, post = {}, &block)
    return if link.nil?

    link = [link] unless link.respond_to?(:to_ary)
    block ||= lambda { |data| data }
    result = nil

    link.each do |u|
      fetch(u, post) do |res|
        result = page_class.extract(res.body, u, self, &block)
      end
    end

    result
  end

  # Fetch a URL in the session, but without a Scrapes::Page.
  #
  # The resolved URI serves as the base URI while the block runs. Popping it
  # happens in an +ensure+ so that an error raised by the crawler or by the
  # block no longer leaves a stale entry on the base URI stack.
  #
  # Yields the crawler's response; returns the absolute URI that was fetched.
  def fetch(uri, post = {}, &block)
    u = absolute_uri(uri)
    @base_uris.push(u)
    begin
      yield(@crawler.fetch(u, post))
    ensure
      @base_uris.pop
    end
    u
  end

  # Refresh the session, sometimes necessary when you are getting pages out of the
  # cache, but then go to the real web site and the session has expired.
  #
  # Does nothing unless a session URI is set, no refresh is already running
  # and more than +timeout+ seconds passed since the previous refresh.
  # Returns self.
  def refresh
    if !@refreshing and @uri and (Time.now - @when) > @timeout
      begin
        @refreshing = true # guards against re-entry from the fetch below
        @when = Time.now
        @cookies.clear

        @crawler.cache.without_cache do
          @crawler.fetch(uri, post)
        end
      ensure
        @refreshing = false
      end
    end

    self
  end

  # Convert a relative URI to an absolute URI using the innermost base URI.
  # Returns +uri+ untouched when there is no base URI.
  def absolute_uri(uri)
    return uri if @base_uris.empty?

    base = URI.parse(@base_uris.last)
    base.merge(uri).to_s
  end
end
153
+ ################################################################################
154
+ end
155
+ ################################################################################