spider 0.1.0 → 0.2.0

data/lib/spider.rb CHANGED
@@ -1,126 +1,254 @@
  # Copyright 2007 Mike Burns
+ # :include: README

- # This program is free software; you can redistribute it and/or modify
- # it under the terms of the GNU General Public License as published by
- # the Free Software Foundation; either version 2 of the License, or
- # (at your option) any later version.
- #
- # This program is distributed in the hope that it will be useful,
- # but WITHOUT ANY WARRANTY; without even the implied warranty of
- # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- # GNU General Public License for more details.
- #
- # You should have received a copy of the GNU General Public License
- # along with this program; if not, write to the Free Software
- # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ # Redistribution and use in source and binary forms, with or without
+ # modification, are permitted provided that the following conditions are met:
+ #     * Redistributions of source code must retain the above copyright
+ #       notice, this list of conditions and the following disclaimer.
+ #     * Redistributions in binary form must reproduce the above copyright
+ #       notice, this list of conditions and the following disclaimer in the
+ #       documentation and/or other materials provided with the distribution.
+ #     * Neither the name Mike Burns nor the
+ #       names of its contributors may be used to endorse or promote products
+ #       derived from this software without specific prior written permission.
+ #
+ # THIS SOFTWARE IS PROVIDED BY Mike Burns ``AS IS'' AND ANY
+ # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ # DISCLAIMED. IN NO EVENT SHALL Mike Burns BE LIABLE FOR ANY
+ # DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ # ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

  require 'robot_rules'
  require 'open-uri'
  require 'uri'
+ require 'net/http'
+
+ class Net::HTTPResponse #:nodoc:
+   def success?; false; end
+ end
+ class Net::HTTPSuccess #:nodoc:
+   def success?; true; end
+ end

  # A spidering library for Ruby. Handles robots.txt, scraping, finding more
  # links, and doing it all over again.
- module Spider
+ class Spider
+   # Runs the spider starting at the given URL. Also takes a block that is given
+   # the SpiderInstance. Use the block to define the rules and handlers for
+   # the discovered Web pages.
+   #
+   #  Spider.start_at('http://mike-burns.com/') do |s|
+   #    s.add_url_check do |a_url|
+   #      a_url =~ %r{^http://mike-burns.com.*}
+   #    end
+   #
+   #    s.on 404 do |a_url, err_code|
+   #      puts "URL not found: #{a_url}"
+   #    end
+   #
+   #    s.on :success do |a_url, code, headers, body|
+   #      puts "body: #{body}"
+   #    end
+   #
+   #    s.on :any do |a_url, resp|
+   #      puts "URL returned anything: #{a_url} with this code #{resp.code}"
+   #    end
+   #  end
+
+   def self.start_at(a_url, &block)
+     rules = RobotRules.new('Ruby Spider 1.0')
+     a_spider = SpiderInstance.new([a_url], [], rules, [])
+     block.call(a_spider)
+     a_spider.start!
+   end
+ end

-   # [String] (String String -> a) -> omega
-   # The only function worth calling. Takes a list of seed URLs and a block.
-   # This block is passed each URL and its Web page.
+ class SpiderInstance
+   def initialize(next_urls, seen = [], rules = nil, robots_seen = []) #:nodoc:
+     @url_checks = []
+     @cache = :memory
+     @callbacks = {:any => lambda {}, :success => {}, :failure => {}}
+     @next_urls = next_urls
+     @seen = seen
+     @rules = rules || RobotRules.new('Ruby Spider 1.0')
+     @robots_seen = robots_seen
+   end
+
+   # Add a predicate that determines whether to continue down this URL's path.
+   # All predicates must be true in order for a URL to proceed.
    #
-   # Examples:
+   # Takes a block that takes a string and produces a boolean. For example, this
+   # will ensure that the URL starts with 'http://mike-burns.com':
    #
-   #  spider(['http://yahoo.com']) do |a_url, web_page|
-   #    puts "At #{a_url}"
+   #  add_url_check { |a_url| a_url =~ %r{^http://mike-burns.com.*} }
+   def add_url_check(&block)
+     @url_checks << block
+   end
+
+   def use_cache(cache_type) #:nodoc:
+     @cache = cache_type
+   end
+
+   # Add a response handler. A response handler's trigger can be :any, :success,
+   # :failure, or any HTTP status code. The handler itself can be either a Proc
+   # or a block. The arguments to the block depend on the trigger:
+   #
+   # If the trigger is :any, the arguments are the URL as a string and an
+   # instance of Net::HTTPResponse.
+   #
+   # If the trigger is :success or any HTTP status code that represents a
+   # successful result, the arguments are the URL as a string, the HTTP status
+   # code, an instance of Net::HTTPSuccess, and the body of the result as a
+   # string.
+   #
+   # If the trigger is :failure or any HTTP status code that represents a failed
+   # result, the arguments are the URL as a string and the HTTP status code.
+   #
+   # For example:
+   #
+   #  on 404 do |a_url, code|
+   #    puts "URL not found: #{a_url}"
    #  end
    #
-   #  spider(['http://mike-burns.com','http://matthoran.com']) do |u, page|
-   #    scrape_images(page).each { |img| store_image!(img) }
+   #  on :success do |a_url, code, resp, body|
+   #    puts a_url
+   #    puts body
    #  end
-   def spider(urls)
-     rules = RobotRules.new('Ruby Spider 1.0')
-     next_urls = (urls.is_a?(Array) ? urls : [urls])
-     seen = []
-     robots_seen = []
-     # This used to be (tail) recursive, but Ruby doesn't optimize that.
-     # I have no idea if this iterative version is correct, but it seems it.
+   #
+   #  on :any do |a_url, resp|
+   #    puts "Given this code: #{resp.code}"
+   #  end
+   def on(code, p = nil, &block)
+     f = p ? p : block
+     case code
+     when Fixnum
+       @callbacks[success_or_failure(code)][code] = f
+     else
+       if :any == code.to_sym
+         @callbacks[:any] = f
+       else
+         @callbacks[code.to_sym][:any] = f
+       end
+     end
+   end
+
+   def start! #:nodoc:
+     next_urls = @next_urls
      begin
        next_urls = next_urls.map do |a_url|
          [a_url, (URI.parse(a_url) rescue nil)]
        end.select do |a_url, parsed_url|
-         !parsed_url.nil? && !seen.include?(a_url) &&
-           allowed?(a_url, parsed_url, rules, robots_seen)
+         allowable_url?(a_url, parsed_url)
        end.map do |a_url, parsed_url|
-         scrape_links(a_url, parsed_url) do |a_url,web_page|
-           seen << a_url
-           yield(a_url,web_page)
+         get_page(parsed_url) do |response|
+           do_callbacks(a_url, response)
+           generate_next_urls(a_url, response)
          end
-       end.flatten.map { |a_url,parsed_url| a_url }
+       end.flatten
      end while !next_urls.empty?
    end

-   private
+   def success_or_failure(code) #:nodoc:
+     if code > 199 && code < 300
+       :success
+     else
+       :failure
+     end
+   end
+
+   def allowable_url?(a_url, parsed_url) #:nodoc:
+     !parsed_url.nil? && !@seen.include?(parsed_url) && allowed?(a_url, parsed_url) &&
+       @url_checks.map{|url_check|url_check.call(a_url)}.all?
+   end

    # True if the robots.txt for that URL allows access to it.
-   def allowed?(a_url, parsed_url, rules, robots_seen) # :nodoc:
+   def allowed?(a_url, parsed_url) # :nodoc:
      u = "#{parsed_url.scheme}://#{parsed_url.host}:#{parsed_url.port}/robots.txt"
      begin
-       unless robots_seen.include?(u)
+       unless @robots_seen.include?(u)
          open(u, 'User-Agent' => 'Ruby Spider',
            'Accept' => 'text/html,text/xml,application/xml,text/plain') do |url|
-           rules.parse(u, url.read)
+           @rules.parse(u, url.read)
          end
-         robots_seen << u
+         @robots_seen << u
        end
-       rules.allowed?(a_url)
+       @rules.allowed?(a_url)
      rescue OpenURI::HTTPError
-       true
-     rescue Timeout::Error # to keep it from crashing
-       false
-     rescue
+       true # No robots.txt
+     rescue Exception, Timeout::Error # to keep it from crashing
        false
      end
    end

-   # Produce all the links on the page.
-   def scrape_links(a_url, parsed_url) # :nodoc:
+   def get_page(parsed_url, &block) #:nodoc:
+     @seen << parsed_url
      begin
-       open(a_url, 'User-Agent' => 'Ruby Spider',
-         'Accept' => 'text/html,text/xml,application/xml,text/plain') do |data|
-         web_page = data.read
-         base_url = (web_page.scan(/base\s+href="(.*?)"/i).flatten +
-           [a_url[0,a_url.rindex('/')]])[0]
-         links = web_page.scan(/href="(.*?)"/i).flatten.map do |link|
-           begin
-             parsed_link = URI.parse(link)
-             if parsed_link.fragment == '#'
-               nil
+       Net::HTTP.start(parsed_url.host, parsed_url.port) do |http|
+         r = http.request(Net::HTTP::Get.new(parsed_url.path))
+         if r.is_a?(Net::HTTPRedirection)
+           get_page(URI.parse(r['Location']), block)
+         else
+           block.call(r)
+         end
+       end
+     rescue Timeout::Error, Errno::EINVAL, Errno::ECONNRESET, EOFError, Exception =>e
+       p e
+       nil
+     end
+   end
+
+   def do_callbacks(a_url, resp) #:nodoc:
+     @callbacks[:any].call(a_url, resp) if @callbacks[:any]
+     if resp.success?
+       cb_branch = @callbacks[:success]
+       cb_branch[:any].call(a_url, resp.code, resp, resp.body) if cb_branch[:any]
+       cb_branch[resp.code].call(a_url, resp.code, resp.headers, resp.body) if cb_branch[resp.code]
+     else
+       cb_branch = @callbacks[:failure]
+       cb_branch[:any].call(a_url, resp.code) if cb_branch[:any]
+       cb_branch[resp.code].call(a_url, resp.code) if cb_branch[resp.code]
+     end
+   end
+
+   def generate_next_urls(a_url, resp) #:nodoc:
+     web_page = resp.body
+     base_url = (web_page.scan(/base\s+href="(.*?)"/i).flatten +
+       [a_url[0,a_url.rindex('/')]])[0]
+     base_url = remove_trailing_slash(base_url)
+     web_page.scan(/href="(.*?)"/i).flatten.map do |link|
+       begin
+         parsed_link = URI.parse(link)
+         if parsed_link.fragment == '#'
+           nil
+         else
+           case parsed_link.scheme
+           when 'http'
+             link
+           when nil
+             u = URI.parse(base_url)
+             if link[0].chr == '/'
+               "#{u.scheme}://#{u.host}:#{u.port}#{link}"
+             elsif u.path.nil? || u.path == ''
+               "#{u.scheme}://#{u.host}:#{u.port}/#{link}"
              else
-               case parsed_link.scheme
-               when 'http'
-                 link
-               when nil
-                 u = URI.parse(base_url)
-                 if link[0].chr == '/'
-                   "#{u.scheme}://#{u.host}:#{u.port}#{link}"
-                 else
-                   "#{u.scheme}://#{u.host}:#{u.port}/#{u.path}/#{link}"
-                 end
-               else
-                 nil
-               end
+               "#{u.scheme}://#{u.host}:#{u.port}/#{u.path}/#{link}"
              end
-           rescue
+         else
              nil
            end
-       end.reject{|link|link.nil?}
-
-       yield(a_url,web_page)
-       links
+       end
+     rescue
+       nil
        end
-     rescue Timeout::Error # to keep it from crashing
-       []
-     rescue
-       []
-     end
+     end.compact
    end

+   def remove_trailing_slash(s)
+     s.sub(%r{/*$},'')
+   end
  end
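
In 0.1.0 the whole public API was the module-level spider(urls) function removed above; in 0.2.0 the same crawl is driven through Spider.start_at, URL checks, and per-status response handlers. The following is a minimal usage sketch assembled from the documentation comments in this diff; the start URL, the regexp, and the handler bodies are illustrative placeholders, not code from the gem.

    require 'spider'

    # Crawl one site, following only links that stay on the same host.
    Spider.start_at('http://example.com/') do |s|
      # Every registered check must return true for a URL to be fetched.
      s.add_url_check { |a_url| a_url =~ %r{^http://example\.com} }

      # :success handlers receive the URL, the status code, the response
      # object, and the body.
      s.on :success do |a_url, code, resp, body|
        puts "#{code} #{a_url} (#{body.length} bytes)"
      end

      # A numeric trigger fires only for that exact status code.
      s.on 404 do |a_url, code|
        puts "URL not found: #{a_url}"
      end
    end

The second hunk below adds a new RSpec suite that exercises SpiderInstance directly, stubbing Net::HTTP with mocha.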
@@ -0,0 +1,219 @@
+ require 'rubygems'
+ require 'spec'
+ require File.dirname(__FILE__)+'/../lib/spider'
+
+ Spec::Runner.configure { |c| c.mock_with :mocha }
+
+ describe 'SpiderInstance' do
+   it 'should skip URLs when allowable_url? is false' do
+     u = 'http://example.com/'
+     u_p = URI.parse(u)
+     http_resp = stub(:success? => true, :code => 200, :headers => 1, :body => 1)
+     Net::HTTP.stubs(:start).yields(stub(:request => http_resp))
+     si = SpiderInstance.new([u])
+     si.expects(:allowable_url?).with(u, u_p).returns(false)
+     si.expects(:get_page).times(0)
+     si.start!
+   end
+
+   it 'should not skip URLs when allowable_url? is true' do
+     u = 'http://example.com/'
+     u_p = URI.parse(u)
+     http_resp = stub(:success? => true, :code => 200, :headers => 1, :body => 1)
+     Net::HTTP.stubs(:start).yields(stub(:request => http_resp))
+     si = SpiderInstance.new([u])
+     si.expects(:allowable_url?).with(u, u_p).returns(true)
+     si.expects(:allowable_url?).with(nil, nil).returns(false)
+     si.expects(:get_page).with(URI.parse(u))
+     si.start!
+   end
+
+   it 'should disallow URLs when the robots.txt says to' do
+     robot_rules = stub
+     SpiderInstance.any_instance.expects(:open).
+       with('http://example.com:80/robots.txt', 'User-Agent' => 'Ruby Spider',
+         'Accept' => 'text/html,text/xml,application/xml,text/plain').
+       yields(stub(:read => 'robots.txt content'))
+     robot_rules.expects(:parse).with('http://example.com:80/robots.txt',
+       'robots.txt content')
+     robot_rules.expects(:allowed?).with('http://example.com/').returns(false)
+     si = SpiderInstance.new(['http://example.com/'], [], robot_rules, [])
+     allowable = si.allowable_url?('http://example.com/',
+       URI.parse('http://example.com/'))
+     allowable.should == false
+   end
+
+   it 'should disallow URLs when they fail any url_check' do
+     si = SpiderInstance.new(['http://example.com/'])
+     si.stubs(:allowed?).returns(true)
+     si.add_url_check { |a_url| false }
+     allowable = si.allowable_url?('http://example.com/',
+       URI.parse('http://example.com/'))
+     allowable.should == false
+   end
+
+   it 'should support multiple url_checks' do
+     @first_url_check = false
+     @second_url_check = false
+     si = SpiderInstance.new(['http://example.com/'])
+     si.stubs(:allowed?).returns(true)
+     si.add_url_check do |a_url|
+       @first_url_check = true
+       true
+     end
+     si.add_url_check do |a_url|
+       @second_url_check = true
+       false
+     end
+     allowable = si.allowable_url?('http://example.com/',
+       URI.parse('http://example.com/'))
+     allowable.should == false
+     @first_url_check == true
+     @second_url_check == true
+   end
+
+   it 'should support memcached'
+   it 'should avoid cycles using memcached'
+
+   it 'should support memory' do
+     si = SpiderInstance.new(['http://example.com/'])
+     si.use_cache :memory # No exn
+   end
+
+   it 'should avoid cycles using memory' do
+     u = 'http://example.com/'
+     u_p = URI.parse(u)
+     si = SpiderInstance.new([u], [u_p])
+     si.stubs(:allowed?).returns(true)
+     allowable = si.allowable_url?(u, u_p)
+     allowable.should == false
+     u_p.should_not be_nil
+   end
+
+   it 'should call the 404 handler for 404s' do
+     @proc_called = false
+     http_resp = stub(:success? => false, :code => 404)
+     Net::HTTP.stubs(:start).yields(stub(:request => http_resp))
+     si = SpiderInstance.new(['http://example.com/'])
+     si.stubs(:allowed?).returns(true)
+     si.stubs(:generate_next_urls).returns([])
+     si.on(404) {|*a| @proc_called = true}
+     si.start!
+     @proc_called.should == true
+   end
+
+   it 'should call the :success handler on success' do
+     @proc_called = false
+     http_resp = stub(:success? => true, :code => 200, :headers => 1, :body => 1)
+     Net::HTTP.stubs(:start).yields(stub(:request => http_resp))
+     si = SpiderInstance.new(['http://example.com/'])
+     si.stubs(:allowed?).returns(true)
+     si.stubs(:generate_next_urls).returns([])
+     si.on(:success) {|*a| @proc_called = true}
+     si.start!
+     @proc_called.should == true
+   end
+
+   it 'should not call the :success handler on failure' do
+     @proc_called = false
+     http_resp = stub(:success? => false, :code => 404)
+     Net::HTTP.stubs(:start).yields(stub(:request => http_resp))
+     si = SpiderInstance.new(['http://example.com/'])
+     si.stubs(:allowed?).returns(true)
+     si.stubs(:generate_next_urls).returns([])
+     si.on(:success) {|*a| @proc_called = true}
+     si.start!
+     @proc_called.should == false
+   end
+
+   it 'should call the :success handler and the 200 handler on 200' do
+     @proc_200_called = false
+     @proc_success_called = false
+     http_resp = stub(:success? => true, :code => 200, :headers => 1, :body => 1)
+     Net::HTTP.stubs(:start).yields(stub(:request => http_resp))
+     si = SpiderInstance.new(['http://example.com/'])
+     si.stubs(:allowed?).returns(true)
+     si.stubs(:generate_next_urls).returns([])
+     si.on(:success) {|*a| @proc_success_called = true}
+     si.on(200) {|*a| @proc_200_called = true}
+     si.start!
+     @proc_200_called.should == true
+     @proc_success_called.should == true
+   end
+
+   it 'should not call the :failure handler on success' do
+     @proc_called = false
+     http_resp = stub(:success? => true, :code => 200, :headers => 1, :body => 1)
+     Net::HTTP.stubs(:start).yields(stub(:request => http_resp))
+     si = SpiderInstance.new(['http://example.com/'])
+     si.stubs(:allowed?).returns(true)
+     si.stubs(:generate_next_urls).returns([])
+     si.on(:failure) {|*a| @proc_called = true}
+     si.start!
+     @proc_called.should == false
+   end
+
+   it 'should call the :failure handler on failure' do
+     @proc_called = false
+     http_resp = stub(:success? => false, :code => 404)
+     Net::HTTP.stubs(:start).yields(stub(:request => http_resp))
+     si = SpiderInstance.new(['http://example.com/'])
+     si.stubs(:allowed?).returns(true)
+     si.stubs(:generate_next_urls).returns([])
+     si.on(:failure) {|*a| @proc_called = true}
+     si.start!
+     @proc_called.should == true
+   end
+
+   it 'should call the :failure handler and the 404 handler on 404' do
+     @proc_404_called = false
+     @proc_failure_called = false
+     http_resp = stub(:success? => false, :code => 404)
+     Net::HTTP.stubs(:start).yields(stub(:request => http_resp))
+     si = SpiderInstance.new(['http://example.com/'])
+     si.stubs(:allowed?).returns(true)
+     si.stubs(:generate_next_urls).returns([])
+     si.on(:failure) {|*a| @proc_failure_called = true}
+     si.on(404) {|*a| @proc_404_called = true}
+     si.start!
+     @proc_404_called.should == true
+     @proc_failure_called.should == true
+   end
+
+   it 'should call the :any handler even when a handler for the error code is defined' do
+     @any_called = false
+     http_resp = stub(:success? => true, :code => 200)
+     Net::HTTP.stubs(:start).yields(stub(:request => http_resp))
+     si = SpiderInstance.new(['http://example.com/'])
+     si.stubs(:allowed?).returns(true)
+     si.stubs(:generate_next_urls).returns([])
+     si.on(:any) { |*a| @any_called = true }
+     si.on(202) {|*a|}
+     si.start!
+     @any_called.should == true
+   end
+
+   it 'should support a block as a response handler' do
+     @proc_called = false
+     http_resp = stub(:success? => true, :code => 200)
+     Net::HTTP.stubs(:start).yields(stub(:request => http_resp))
+     si = SpiderInstance.new(['http://example.com/'])
+     si.stubs(:allowed?).returns(true)
+     si.stubs(:generate_next_urls).returns([])
+     si.on(:any) { |*a| @proc_called = true }
+     si.start!
+     @proc_called.should == true
+   end
+
+   it 'should support a proc as a response handler' do
+     @proc_called = false
+     http_resp = stub(:success? => true, :code => 200)
+     Net::HTTP.stubs(:start).yields(stub(:request => http_resp))
+     si = SpiderInstance.new(['http://example.com/'])
+     si.stubs(:allowed?).returns(true)
+     si.stubs(:generate_next_urls).returns([])
+     si.on(:any, Proc.new { |*a| @proc_called = true })
+     si.start!
+     @proc_called.should == true
+   end
+ end
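
The handler dispatch that these examples assert can also be observed in isolation by calling do_callbacks directly. do_callbacks is an internal (#:nodoc:) method, so the sketch below only illustrates the dispatch rules, not a supported API; FakeResponse is a hypothetical stand-in for Net::HTTPResponse and is not part of the gem or its specs.

    require 'spider'

    # Minimal fake exposing the three methods do_callbacks relies on:
    # success?, code, and body.
    FakeResponse = Struct.new(:code, :body) do
      def success?
        code > 199 && code < 300
      end
    end

    si = SpiderInstance.new(['http://example.com/'])
    si.on(:any)     { |a_url, resp| puts "saw #{a_url} (#{resp.code})" }
    si.on(:failure) { |a_url, code| puts "failed: #{a_url}" }
    si.on(404)      { |a_url, code| puts "missing: #{a_url}" }

    # A 404 response fires the :any handler, the :failure handler, and the
    # 404 handler, mirroring the ':failure handler and the 404 handler' and
    # ':any handler' examples above.
    si.do_callbacks('http://example.com/missing', FakeResponse.new(404, ''))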