spider 0.1.0 → 0.2.0

data/lib/spider.rb CHANGED
@@ -1,126 +1,254 @@
  # Copyright 2007 Mike Burns
+ # :include: README

- # This program is free software; you can redistribute it and/or modify
- # it under the terms of the GNU General Public License as published by
- # the Free Software Foundation; either version 2 of the License, or
- # (at your option) any later version.
- #
- # This program is distributed in the hope that it will be useful,
- # but WITHOUT ANY WARRANTY; without even the implied warranty of
- # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- # GNU General Public License for more details.
- #
- # You should have received a copy of the GNU General Public License
- # along with this program; if not, write to the Free Software
- # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ # Redistribution and use in source and binary forms, with or without
+ # modification, are permitted provided that the following conditions are met:
+ #     * Redistributions of source code must retain the above copyright
+ #       notice, this list of conditions and the following disclaimer.
+ #     * Redistributions in binary form must reproduce the above copyright
+ #       notice, this list of conditions and the following disclaimer in the
+ #       documentation and/or other materials provided with the distribution.
+ #     * Neither the name Mike Burns nor the
+ #       names of its contributors may be used to endorse or promote products
+ #       derived from this software without specific prior written permission.
+ #
+ # THIS SOFTWARE IS PROVIDED BY Mike Burns ``AS IS'' AND ANY
+ # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ # DISCLAIMED. IN NO EVENT SHALL Mike Burns BE LIABLE FOR ANY
+ # DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ # ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

  require 'robot_rules'
  require 'open-uri'
  require 'uri'
+ require 'net/http'
+
+ class Net::HTTPResponse #:nodoc:
+   def success?; false; end
+ end
+ class Net::HTTPSuccess #:nodoc:
+   def success?; true; end
+ end

  # A spidering library for Ruby. Handles robots.txt, scraping, finding more
  # links, and doing it all over again.
- module Spider
+ class Spider
+   # Runs the spider starting at the given URL. Also takes a block that is given
+   # the SpiderInstance. Use the block to define the rules and handlers for
+   # the discovered Web pages.
+   #
+   #  Spider.start_at('http://mike-burns.com/') do |s|
+   #    s.add_url_check do |a_url|
+   #      a_url =~ %r{^http://mike-burns.com.*}
+   #    end
+   #
+   #    s.on 404 do |a_url, err_code|
+   #      puts "URL not found: #{a_url}"
+   #    end
+   #
+   #    s.on :success do |a_url, code, headers, body|
+   #      puts "body: #{body}"
+   #    end
+   #
+   #    s.on :any do |a_url, resp|
+   #      puts "URL returned anything: #{a_url} with this code #{resp.code}"
+   #    end
+   #  end
+
+   def self.start_at(a_url, &block)
+     rules = RobotRules.new('Ruby Spider 1.0')
+     a_spider = SpiderInstance.new([a_url], [], rules, [])
+     block.call(a_spider)
+     a_spider.start!
+   end
+ end

-   # [String] (String String -> a) -> omega
-   # The only function worth calling. Takes a list of seed URLs and a block.
-   # This block is passed each URL and its Web page.
+ class SpiderInstance
+   def initialize(next_urls, seen = [], rules = nil, robots_seen = []) #:nodoc:
+     @url_checks = []
+     @cache = :memory
+     @callbacks = {:any => lambda {}, :success => {}, :failure => {}}
+     @next_urls = next_urls
+     @seen = seen
+     @rules = rules || RobotRules.new('Ruby Spider 1.0')
+     @robots_seen = robots_seen
+   end
+
+   # Add a predicate that determines whether to continue down this URL's path.
+   # All predicates must be true in order for a URL to proceed.
    #
-   # Examples:
+   # Takes a block that takes a string and produces a boolean. For example, this
+   # will ensure that the URL starts with 'http://mike-burns.com':
    #
-   #  spider(['http://yahoo.com']) do |a_url, web_page|
-   #    puts "At #{a_url}"
+   #  add_url_check { |a_url| a_url =~ %r{^http://mike-burns.com.*} }
+   def add_url_check(&block)
+     @url_checks << block
+   end
+
+   def use_cache(cache_type) #:nodoc:
+     @cache = cache_type
+   end
+
+   # Add a response handler. A response handler's trigger can be :any, :success,
+   # :failure, or any HTTP status code. The handler itself can be either a Proc
+   # or a block. The arguments to the block depend on the trigger:
+   #
+   # If the trigger is :any, the arguments are the URL as a string and an
+   # instance of Net::HTTPResponse.
+   #
+   # If the trigger is :success or any HTTP status code that represents a
+   # successful result, the arguments are the URL as a string, the HTTP status
+   # code, an instance of Net::HTTPSuccess, and the body of the result as a
+   # string.
+   #
+   # If the trigger is :failure or any HTTP status code that represents a failed
+   # result, the arguments are the URL as a string and the HTTP status code.
+   #
+   # For example:
+   #
+   #  on 404 do |a_url, code|
+   #    puts "URL not found: #{a_url}"
    #  end
    #
-   #  spider(['http://mike-burns.com','http://matthoran.com']) do |u, page|
-   #    scrape_images(page).each { |img| store_image!(img) }
+   #  on :success do |a_url, code, resp, body|
+   #    puts a_url
+   #    puts body
    #  end
-   def spider(urls)
-     rules = RobotRules.new('Ruby Spider 1.0')
-     next_urls = (urls.is_a?(Array) ? urls : [urls])
-     seen = []
-     robots_seen = []
-     # This used to be (tail) recursive, but Ruby doesn't optimize that.
-     # I have no idea if this iterative version is correct, but it seems it.
+   #
+   #  on :any do |a_url, resp|
+   #    puts "Given this code: #{resp.code}"
+   #  end
+   def on(code, p = nil, &block)
+     f = p ? p : block
+     case code
+     when Fixnum
+       @callbacks[success_or_failure(code)][code] = f
+     else
+       if :any == code.to_sym
+         @callbacks[:any] = f
+       else
+         @callbacks[code.to_sym][:any] = f
+       end
+     end
+   end
+
+   def start! #:nodoc:
+     next_urls = @next_urls
      begin
        next_urls = next_urls.map do |a_url|
          [a_url, (URI.parse(a_url) rescue nil)]
        end.select do |a_url, parsed_url|
-         !parsed_url.nil? && !seen.include?(a_url) &&
-           allowed?(a_url, parsed_url, rules, robots_seen)
+         allowable_url?(a_url, parsed_url)
        end.map do |a_url, parsed_url|
-         scrape_links(a_url, parsed_url) do |a_url,web_page|
-           seen << a_url
-           yield(a_url,web_page)
+         get_page(parsed_url) do |response|
+           do_callbacks(a_url, response)
+           generate_next_urls(a_url, response)
          end
-       end.flatten.map { |a_url,parsed_url| a_url }
+       end.flatten
      end while !next_urls.empty?
    end

-   private
+   def success_or_failure(code) #:nodoc:
+     if code > 199 && code < 300
+       :success
+     else
+       :failure
+     end
+   end
+
+   def allowable_url?(a_url, parsed_url) #:nodoc:
+     !parsed_url.nil? && !@seen.include?(parsed_url) && allowed?(a_url, parsed_url) &&
+       @url_checks.map{|url_check|url_check.call(a_url)}.all?
+   end

    # True if the robots.txt for that URL allows access to it.
-   def allowed?(a_url, parsed_url, rules, robots_seen) # :nodoc:
+   def allowed?(a_url, parsed_url) # :nodoc:
      u = "#{parsed_url.scheme}://#{parsed_url.host}:#{parsed_url.port}/robots.txt"
      begin
-       unless robots_seen.include?(u)
+       unless @robots_seen.include?(u)
          open(u, 'User-Agent' => 'Ruby Spider',
               'Accept' => 'text/html,text/xml,application/xml,text/plain') do |url|
-           rules.parse(u, url.read)
+           @rules.parse(u, url.read)
          end
-         robots_seen << u
+         @robots_seen << u
        end
-       rules.allowed?(a_url)
+       @rules.allowed?(a_url)
      rescue OpenURI::HTTPError
-       true
-     rescue Timeout::Error # to keep it from crashing
-       false
-     rescue
+       true # No robots.txt
+     rescue Exception, Timeout::Error # to keep it from crashing
        false
      end
    end

-   # Produce all the links on the page.
-   def scrape_links(a_url, parsed_url) # :nodoc:
+   def get_page(parsed_url, &block) #:nodoc:
+     @seen << parsed_url
      begin
-       open(a_url, 'User-Agent' => 'Ruby Spider',
-            'Accept' => 'text/html,text/xml,application/xml,text/plain') do |data|
-         web_page = data.read
-         base_url = (web_page.scan(/base\s+href="(.*?)"/i).flatten +
-                     [a_url[0,a_url.rindex('/')]])[0]
-         links = web_page.scan(/href="(.*?)"/i).flatten.map do |link|
-           begin
-             parsed_link = URI.parse(link)
-             if parsed_link.fragment == '#'
-               nil
+       Net::HTTP.start(parsed_url.host, parsed_url.port) do |http|
+         r = http.request(Net::HTTP::Get.new(parsed_url.path))
+         if r.is_a?(Net::HTTPRedirection)
+           get_page(URI.parse(r['Location']), block)
+         else
+           block.call(r)
+         end
+       end
+     rescue Timeout::Error, Errno::EINVAL, Errno::ECONNRESET, EOFError, Exception =>e
+       p e
+       nil
+     end
+   end
+
+   def do_callbacks(a_url, resp) #:nodoc:
+     @callbacks[:any].call(a_url, resp) if @callbacks[:any]
+     if resp.success?
+       cb_branch = @callbacks[:success]
+       cb_branch[:any].call(a_url, resp.code, resp, resp.body) if cb_branch[:any]
+       cb_branch[resp.code].call(a_url, resp.code, resp.headers, resp.body) if cb_branch[resp.code]
+     else
+       cb_branch = @callbacks[:failure]
+       cb_branch[:any].call(a_url, resp.code) if cb_branch[:any]
+       cb_branch[resp.code].call(a_url, resp.code) if cb_branch[resp.code]
+     end
+   end
+
+   def generate_next_urls(a_url, resp) #:nodoc:
+     web_page = resp.body
+     base_url = (web_page.scan(/base\s+href="(.*?)"/i).flatten +
+                 [a_url[0,a_url.rindex('/')]])[0]
+     base_url = remove_trailing_slash(base_url)
+     web_page.scan(/href="(.*?)"/i).flatten.map do |link|
+       begin
+         parsed_link = URI.parse(link)
+         if parsed_link.fragment == '#'
+           nil
+         else
+           case parsed_link.scheme
+           when 'http'
+             link
+           when nil
+             u = URI.parse(base_url)
+             if link[0].chr == '/'
+               "#{u.scheme}://#{u.host}:#{u.port}#{link}"
+             elsif u.path.nil? || u.path == ''
+               "#{u.scheme}://#{u.host}:#{u.port}/#{link}"
             else
-               case parsed_link.scheme
-               when 'http'
-                 link
-               when nil
-                 u = URI.parse(base_url)
-                 if link[0].chr == '/'
-                   "#{u.scheme}://#{u.host}:#{u.port}#{link}"
-                 else
-                   "#{u.scheme}://#{u.host}:#{u.port}/#{u.path}/#{link}"
-                 end
-               else
-                 nil
-               end
+               "#{u.scheme}://#{u.host}:#{u.port}/#{u.path}/#{link}"
             end
-           rescue
+           else
             nil
           end
-       end.reject{|link|link.nil?}
-
-       yield(a_url,web_page)
-       links
+         end
+       rescue
+         nil
        end
-     rescue Timeout::Error # to keep it from crashing
-       []
-     rescue
-       []
-     end
+     end.compact
    end

+   def remove_trailing_slash(s)
+     s.sub(%r{/*$},'')
+   end
  end
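
In sum, this file's diff replaces 0.1.0's single module-level spider(urls) function, which yielded each URL and its raw page body, with a Spider.start_at entry point plus per-instance URL predicates (add_url_check) and status-keyed response handlers (on). As orientation, here is a usage sketch of the new 0.2.0 API assembled from the doc comments above; the seed URL is a placeholder, and the script assumes the spider gem and its robot_rules dependency are installed:

    require 'rubygems'
    require 'spider'

    # Crawl a single site, staying on the seed host.
    Spider.start_at('http://example.com/') do |s|
      # Every registered check must return true for a URL to be followed.
      s.add_url_check { |a_url| a_url =~ %r{^http://example\.com.*} }

      # Code-keyed handlers receive the URL and the status code.
      s.on(404) { |a_url, code| puts "not found: #{a_url}" }

      # :success handlers also receive the response object and its body.
      s.on(:success) do |a_url, code, resp, body|
        puts "#{a_url} returned #{body.length} bytes"
      end
    end

Note that start_at configures and then immediately runs the SpiderInstance, so the block is purely declarative; there is no separate run step. The hunk below adds a brand-new spec file for SpiderInstance (its filename is not recorded in this diff).
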
@@ -0,0 +1,219 @@
+ require 'rubygems'
+ require 'spec'
+ require File.dirname(__FILE__)+'/../lib/spider'
+
+ Spec::Runner.configure { |c| c.mock_with :mocha }
+
+ describe 'SpiderInstance' do
+   it 'should skip URLs when allowable_url? is false' do
+     u = 'http://example.com/'
+     u_p = URI.parse(u)
+     http_resp = stub(:success? => true, :code => 200, :headers => 1, :body => 1)
+     Net::HTTP.stubs(:start).yields(stub(:request => http_resp))
+     si = SpiderInstance.new([u])
+     si.expects(:allowable_url?).with(u, u_p).returns(false)
+     si.expects(:get_page).times(0)
+     si.start!
+   end
+
+   it 'should not skip URLs when allowable_url? is true' do
+     u = 'http://example.com/'
+     u_p = URI.parse(u)
+     http_resp = stub(:success? => true, :code => 200, :headers => 1, :body => 1)
+     Net::HTTP.stubs(:start).yields(stub(:request => http_resp))
+     si = SpiderInstance.new([u])
+     si.expects(:allowable_url?).with(u, u_p).returns(true)
+     si.expects(:allowable_url?).with(nil, nil).returns(false)
+     si.expects(:get_page).with(URI.parse(u))
+     si.start!
+   end
+
+   it 'should disallow URLs when the robots.txt says to' do
+     robot_rules = stub
+     SpiderInstance.any_instance.expects(:open).
+       with('http://example.com:80/robots.txt', 'User-Agent' => 'Ruby Spider',
+            'Accept' => 'text/html,text/xml,application/xml,text/plain').
+       yields(stub(:read => 'robots.txt content'))
+     robot_rules.expects(:parse).with('http://example.com:80/robots.txt',
+                                      'robots.txt content')
+     robot_rules.expects(:allowed?).with('http://example.com/').returns(false)
+     si = SpiderInstance.new(['http://example.com/'], [], robot_rules, [])
+     allowable = si.allowable_url?('http://example.com/',
+                                   URI.parse('http://example.com/'))
+     allowable.should == false
+   end
+
+   it 'should disallow URLs when they fail any url_check' do
+     si = SpiderInstance.new(['http://example.com/'])
+     si.stubs(:allowed?).returns(true)
+     si.add_url_check { |a_url| false }
+     allowable = si.allowable_url?('http://example.com/',
+                                   URI.parse('http://example.com/'))
+     allowable.should == false
+   end
+
+   it 'should support multiple url_checks' do
+     @first_url_check = false
+     @second_url_check = false
+     si = SpiderInstance.new(['http://example.com/'])
+     si.stubs(:allowed?).returns(true)
+     si.add_url_check do |a_url|
+       @first_url_check = true
+       true
+     end
+     si.add_url_check do |a_url|
+       @second_url_check = true
+       false
+     end
+     allowable = si.allowable_url?('http://example.com/',
+                                   URI.parse('http://example.com/'))
+     allowable.should == false
+     @first_url_check == true
+     @second_url_check == true
+   end
+
+   it 'should support memcached'
+   it 'should avoid cycles using memcached'
+
+   it 'should support memory' do
+     si = SpiderInstance.new(['http://example.com/'])
+     si.use_cache :memory # No exn
+   end
+
+   it 'should avoid cycles using memory' do
+     u = 'http://example.com/'
+     u_p = URI.parse(u)
+     si = SpiderInstance.new([u], [u_p])
+     si.stubs(:allowed?).returns(true)
+     allowable = si.allowable_url?(u, u_p)
+     allowable.should == false
+     u_p.should_not be_nil
+   end
+
+   it 'should call the 404 handler for 404s' do
+     @proc_called = false
+     http_resp = stub(:success? => false, :code => 404)
+     Net::HTTP.stubs(:start).yields(stub(:request => http_resp))
+     si = SpiderInstance.new(['http://example.com/'])
+     si.stubs(:allowed?).returns(true)
+     si.stubs(:generate_next_urls).returns([])
+     si.on(404) {|*a| @proc_called = true}
+     si.start!
+     @proc_called.should == true
+   end
+
+   it 'should call the :success handler on success' do
+     @proc_called = false
+     http_resp = stub(:success? => true, :code => 200, :headers => 1, :body => 1)
+     Net::HTTP.stubs(:start).yields(stub(:request => http_resp))
+     si = SpiderInstance.new(['http://example.com/'])
+     si.stubs(:allowed?).returns(true)
+     si.stubs(:generate_next_urls).returns([])
+     si.on(:success) {|*a| @proc_called = true}
+     si.start!
+     @proc_called.should == true
+   end
+
+   it 'should not call the :success handler on failure' do
+     @proc_called = false
+     http_resp = stub(:success? => false, :code => 404)
+     Net::HTTP.stubs(:start).yields(stub(:request => http_resp))
+     si = SpiderInstance.new(['http://example.com/'])
+     si.stubs(:allowed?).returns(true)
+     si.stubs(:generate_next_urls).returns([])
+     si.on(:success) {|*a| @proc_called = true}
+     si.start!
+     @proc_called.should == false
+   end
+
+   it 'should call the :success handler and the 200 handler on 200' do
+     @proc_200_called = false
+     @proc_success_called = false
+     http_resp = stub(:success? => true, :code => 200, :headers => 1, :body => 1)
+     Net::HTTP.stubs(:start).yields(stub(:request => http_resp))
+     si = SpiderInstance.new(['http://example.com/'])
+     si.stubs(:allowed?).returns(true)
+     si.stubs(:generate_next_urls).returns([])
+     si.on(:success) {|*a| @proc_success_called = true}
+     si.on(200) {|*a| @proc_200_called = true}
+     si.start!
+     @proc_200_called.should == true
+     @proc_success_called.should == true
+   end
+
+   it 'should not call the :failure handler on success' do
+     @proc_called = false
+     http_resp = stub(:success? => true, :code => 200, :headers => 1, :body => 1)
+     Net::HTTP.stubs(:start).yields(stub(:request => http_resp))
+     si = SpiderInstance.new(['http://example.com/'])
+     si.stubs(:allowed?).returns(true)
+     si.stubs(:generate_next_urls).returns([])
+     si.on(:failure) {|*a| @proc_called = true}
+     si.start!
+     @proc_called.should == false
+   end
+
+   it 'should call the :failure handler on failure' do
+     @proc_called = false
+     http_resp = stub(:success? => false, :code => 404)
+     Net::HTTP.stubs(:start).yields(stub(:request => http_resp))
+     si = SpiderInstance.new(['http://example.com/'])
+     si.stubs(:allowed?).returns(true)
+     si.stubs(:generate_next_urls).returns([])
+     si.on(:failure) {|*a| @proc_called = true}
+     si.start!
+     @proc_called.should == true
+   end
+
+   it 'should call the :failure handler and the 404 handler on 404' do
+     @proc_404_called = false
+     @proc_failure_called = false
+     http_resp = stub(:success? => false, :code => 404)
+     Net::HTTP.stubs(:start).yields(stub(:request => http_resp))
+     si = SpiderInstance.new(['http://example.com/'])
+     si.stubs(:allowed?).returns(true)
+     si.stubs(:generate_next_urls).returns([])
+     si.on(:failure) {|*a| @proc_failure_called = true}
+     si.on(404) {|*a| @proc_404_called = true}
+     si.start!
+     @proc_404_called.should == true
+     @proc_failure_called.should == true
+   end
+
+   it 'should call the :any handler even when a handler for the error code is defined' do
+     @any_called = false
+     http_resp = stub(:success? => true, :code => 200)
+     Net::HTTP.stubs(:start).yields(stub(:request => http_resp))
+     si = SpiderInstance.new(['http://example.com/'])
+     si.stubs(:allowed?).returns(true)
+     si.stubs(:generate_next_urls).returns([])
+     si.on(:any) { |*a| @any_called = true }
+     si.on(202) {|*a|}
+     si.start!
+     @any_called.should == true
+   end
+
+   it 'should support a block as a response handler' do
+     @proc_called = false
+     http_resp = stub(:success? => true, :code => 200)
+     Net::HTTP.stubs(:start).yields(stub(:request => http_resp))
+     si = SpiderInstance.new(['http://example.com/'])
+     si.stubs(:allowed?).returns(true)
+     si.stubs(:generate_next_urls).returns([])
+     si.on(:any) { |*a| @proc_called = true }
+     si.start!
+     @proc_called.should == true
+   end
+
+   it 'should support a proc as a response handler' do
+     @proc_called = false
+     http_resp = stub(:success? => true, :code => 200)
+     Net::HTTP.stubs(:start).yields(stub(:request => http_resp))
+     si = SpiderInstance.new(['http://example.com/'])
+     si.stubs(:allowed?).returns(true)
+     si.stubs(:generate_next_urls).returns([])
+     si.on(:any, Proc.new { |*a| @proc_called = true })
+     si.start!
+     @proc_called.should == true
+   end
+ end
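
These examples drive SpiderInstance entirely through mocha stubs in place of real HTTP, so the suite runs without network access; under the RSpec 1.x that require 'spec' and Spec::Runner imply, the file would typically be run with the spec executable. The two bodiless examples, 'should support memcached' and 'should avoid cycles using memcached', are pending placeholders for cache support that 0.2.0 does not yet implement. The last two examples pin down that on accepts a handler either as a block or as an explicit Proc; given a SpiderInstance s as in the earlier sketch, the two registration forms below are equivalent (the logging body is illustrative):

    # Block form.
    s.on(:failure) { |a_url, code| puts "failed: #{a_url} (#{code})" }

    # Proc form; handy when the same handler is reused across triggers.
    failure_logger = Proc.new { |a_url, code| puts "failed: #{a_url} (#{code})" }
    s.on(:failure, failure_logger)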