spider 0.5.4 → 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/VERSION +1 -0
- data/lib/spider/included_in_memcached.rb +6 -5
- data/lib/spider/spider_instance.rb +31 -10
- data/lib/spider.rb +3 -2
- metadata +8 -19
- data/AUTHORS +0 -17
- data/CHANGES +0 -68
- data/LICENSE +0 -21
- data/README.md +0 -175
- data/spec/spec_helper.rb +0 -90
- data/spec/spider/included_in_memcached_spec.rb +0 -43
- data/spec/spider/included_in_redis_spec.rb +0 -43
- data/spec/spider/spider_instance_spec.rb +0 -405
- data/spec/spider_spec.rb +0 -33
- data/spider.gemspec +0 -20
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 7286f875f41881c9c8c385987c322b9832e5b07c6147aa4910a900e59015927e
+  data.tar.gz: 145ecd718a34521c17f0f9e939cb66f964ca0a1f93533969caf7d54426b213a8
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: ba771c7dbbe3df286475a5586ba2d2b63affe762b7bf504694e6865e39c8e7f5047811ac97285a1edc5d99cd478993fd2be9ee3ae244cc1690975f7ab3f0e779
+  data.tar.gz: 2bd17f25db36a267b5534ff663b8394e7439a1701970e6d23ea295818732133b23c4bb35f82aa1f6f90022edcaceafc47b3fee3f7902fd9028a2ec8ad697a2a4
data/VERSION
ADDED
@@ -0,0 +1 @@
+0.7.0
data/lib/spider/included_in_memcached.rb
CHANGED
@@ -1,6 +1,6 @@
 # Use memcached to track cycles.
 
-require 'memcache'
+require 'dalli'
 
 # A specialized class using memcached to track items stored. It supports
 # three operations: new, <<, and include? . Together these can be used to
@@ -12,10 +12,11 @@ require 'memcache'
 #   s.check_already_seen_with IncludedInMemcached.new('localhost:11211')
 # end
 class IncludedInMemcached
-  # Construct a new IncludedInMemcached instance.
-  #
-
-
+  # Construct a new IncludedInMemcached instance. The first argument should be
+  # the memcached server address (e.g., 'localhost:11211'). Additional options
+  # can be passed as a hash (see Dalli::Client documentation).
+  def initialize(server, options = {})
+    @c = Dalli::Client.new(server, options)
   end
 
   # Add an item to the memcache.
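The cycle-checker now talks to memcached through the dalli gem rather than the long-unmaintained memcache-client. A minimal usage sketch under the new constructor; the :namespace option here is illustrative, and any Dalli::Client option can be passed through the second argument:

```ruby
require 'spider'
require 'spider/included_in_memcached'

# Track already-seen URLs in memcached via Dalli.
Spider.start_at('http://example.com/') do |s|
  s.check_already_seen_with IncludedInMemcached.new('localhost:11211',
                                                    namespace: 'spider')
end
```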
data/lib/spider/spider_instance.rb
CHANGED
@@ -165,7 +165,6 @@ class SpiderInstance
     trap("SIGINT") { @interrupted = true }
     begin
       next_urls = @next_urls.pop
-      tmp_n_u = {}
       next_urls.each do |prior_url, urls|
         urls = [urls] unless urls.kind_of?(Array)
         urls.map do |a_url|
@@ -176,12 +175,9 @@ class SpiderInstance
           @setup.call(a_url) unless @setup.nil?
           get_page(parsed_url) do |response|
             do_callbacks(a_url, response, prior_url)
-            #tmp_n_u[a_url] = generate_next_urls(a_url, response)
-            #@next_urls.push tmp_n_u
             generate_next_urls(a_url, response).each do |a_next_url|
               @next_urls.push a_url => a_next_url
             end
-            #exit if interrupted
           end
           @teardown.call(a_url) unless @teardown.nil?
           break if @interrupted
@@ -256,7 +252,7 @@ class SpiderInstance
   def do_callbacks(a_url, resp, prior_url) #:nodoc:
     cbs = [@callbacks[:every],
            resp.success? ? @callbacks[:success] : @callbacks[:failure],
-           @callbacks[resp.code]]
+           @callbacks[resp.code.to_i]]
 
     cbs.each do |cb|
       cb.call(a_url, resp, prior_url) if cb
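This small change is what makes numeric status handlers fire: Net::HTTP responses report their code as a String, while `s.on(404)` registers the handler under an Integer key, so the hash lookup never matched before the `to_i`. A short sketch of the mismatch:

```ruby
require 'net/http'
require 'uri'

resp = Net::HTTP.get_response(URI('http://example.com/missing'))
resp.code                 # => "404" -- a String, not an Integer

callbacks = { 404 => proc { |url| puts "not found: #{url}" } }
callbacks[resp.code]      # => nil, the String "404" never matches the key 404
callbacks[resp.code.to_i] # => the registered handler
```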
@@ -264,11 +260,34 @@ class SpiderInstance
   end
 
   def generate_next_urls(a_url, resp) #:nodoc:
+    # Only scan for links if the content-type is HTML or the URL ends with .html
+    content_type = resp['Content-Type'] || resp['content-type'] || ''
+    url_ends_with_html = a_url.downcase.end_with?('.html')
+
+    unless content_type.downcase.include?('text/html') || url_ends_with_html
+      return []
+    end
+
     web_page = resp.body
     base_url = (web_page.scan(/base\s+href="(.*?)"/i).flatten +
                 [a_url[0,a_url.rindex('/')]])[0]
     base_url = remove_trailing_slash(base_url)
-
+
+    # Extract anchor tags with href attributes, respecting rel="nofollow"
+    web_page.scan(/<a\s[^>]*href="([^"]*)"[^>]*>/i).flatten.map do |link|
+      # Get the full anchor tag to check for rel attribute
+      anchor_match = web_page.match(/<a\s[^>]*href="#{Regexp.escape(link)}"[^>]*>/i)
+      next nil unless anchor_match
+
+      anchor_tag = anchor_match[0]
+
+      # Check if this link has rel="nofollow" or similar attributes that should be respected
+      if anchor_tag.match(/rel\s*=\s*["']([^"']*nofollow[^"']*)["']/i) ||
+         anchor_tag.match(/rel\s*=\s*["']([^"']*sponsored[^"']*)["']/i) ||
+         anchor_tag.match(/rel\s*=\s*["']([^"']*ugc[^"']*)["']/i)
+        next nil # Skip links with nofollow, sponsored, or ugc rel attributes
+      end
+
     begin
       parsed_link = URI.parse(link)
       if parsed_link.fragment == '#'
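Taken together, the new guard clauses mean non-HTML responses yield no outgoing links, and anchors marked rel="nofollow", rel="sponsored", or rel="ugc" are dropped. A rough illustration of the rel filter against a hypothetical page, using the same regexes as the diff:

```ruby
html = <<~HTML
  <a href="/about">About</a>
  <a href="/ad" rel="sponsored">Ad</a>
  <a href="/comment" rel="ugc nofollow">Comment</a>
HTML

# Collect hrefs, then drop any whose anchor tag carries a filtered rel value.
links = html.scan(/<a\s[^>]*href="([^"]*)"[^>]*>/i).flatten.reject do |link|
  tag = html.match(/<a\s[^>]*href="#{Regexp.escape(link)}"[^>]*>/i)[0]
  tag =~ /rel\s*=\s*["'][^"']*(nofollow|sponsored|ugc)[^"']*["']/i
end

links # => ["/about"]
```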
@@ -287,14 +306,16 @@ class SpiderInstance
       case parsed_additional_url.scheme
       when nil
         u = base_url.is_a?(URI) ? base_url : URI.parse(base_url)
+        # Include port if it's not the default port
+        port_part = (u.port && ((u.scheme == 'http' && u.port != 80) || (u.scheme == 'https' && u.port != 443))) ? ":#{u.port}" : ""
         if additional_url[0].chr == '/'
-          "#{u.scheme}://#{u.host}#{additional_url}"
+          "#{u.scheme}://#{u.host}#{port_part}#{additional_url}"
         elsif u.path.nil? || u.path == ''
-          "#{u.scheme}://#{u.host}/#{additional_url}"
+          "#{u.scheme}://#{u.host}#{port_part}/#{additional_url}"
         elsif u.path[0].chr == '/'
-          "#{u.scheme}://#{u.host}#{u.path}/#{additional_url}"
+          "#{u.scheme}://#{u.host}#{port_part}#{u.path}/#{additional_url}"
         else
-          "#{u.scheme}://#{u.host}/#{u.path}/#{additional_url}"
+          "#{u.scheme}://#{u.host}#{port_part}/#{u.path}/#{additional_url}"
         end
       else
         additional_url
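The port_part addition keeps non-default ports when resolving scheme-less links; previously a crawl of a site on, say, port 8888 generated next URLs pointing back at port 80. A quick before/after:

```ruby
require 'uri'

u = URI.parse('http://localhost:8888/foo')
port_part = (u.port && ((u.scheme == 'http' && u.port != 80) ||
             (u.scheme == 'https' && u.port != 443))) ? ":#{u.port}" : ""

"#{u.scheme}://#{u.host}/bar"             # => "http://localhost/bar" (old: port dropped)
"#{u.scheme}://#{u.host}#{port_part}/bar" # => "http://localhost:8888/bar"
```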
data/lib/spider.rb
CHANGED
@@ -4,8 +4,9 @@ require File.dirname(__FILE__)+'/spider/spider_instance'
 # links, and doing it all over again.
 class Spider
 
-
-
+  VERSION = File.read(
+    File.expand_path('../VERSION', __dir__)
+  ).strip.freeze
 
   def self.version
     VERSION
metadata
CHANGED
@@ -1,14 +1,13 @@
 --- !ruby/object:Gem::Specification
 name: spider
 version: !ruby/object:Gem::Version
-  version: 0.5.4
+  version: 0.7.0
 platform: ruby
 authors:
 - John Nagro
-autorequire:
 bindir: bin
 cert_chain: []
-date:
+date: 1980-01-02 00:00:00.000000000 Z
 dependencies: []
 description: |
   A Web spidering library: handles robots.txt, scraping, finding more
@@ -18,10 +17,7 @@ executables: []
 extensions: []
 extra_rdoc_files: []
 files:
-- AUTHORS
-- CHANGES
-- LICENSE
-- README.md
+- VERSION
 - lib/spider.rb
 - lib/spider/included_in_file.rb
 - lib/spider/included_in_memcached.rb
@@ -29,17 +25,12 @@ files:
 - lib/spider/next_urls_in_sqs.rb
 - lib/spider/robot_rules.rb
 - lib/spider/spider_instance.rb
-- spec/spec_helper.rb
-- spec/spider/included_in_memcached_spec.rb
-- spec/spider/included_in_redis_spec.rb
-- spec/spider/spider_instance_spec.rb
-- spec/spider_spec.rb
-- spider.gemspec
 homepage: https://github.com/johnnagro/spider
 licenses:
 - MIT
-metadata:
-
+metadata:
+  source_code_uri: https://github.com/johnnagro/spider
+  changelog_uri: https://github.com/johnnagro/spider/blob/main/CHANGELOG.md
 rdoc_options: []
 require_paths:
 - lib
@@ -47,16 +38,14 @@ required_ruby_version: !ruby/object:Gem::Requirement
 requirements:
 - - ">="
   - !ruby/object:Gem::Version
-    version: '
+    version: '2.5'
 required_rubygems_version: !ruby/object:Gem::Requirement
   requirements:
   - - ">="
   - !ruby/object:Gem::Version
     version: '0'
 requirements: []
-
-rubygems_version: 2.7.6
-signing_key:
+rubygems_version: 3.6.9
 specification_version: 4
 summary: A Web spidering library
 test_files: []
data/AUTHORS
DELETED
@@ -1,17 +0,0 @@
-The Ruby Spider Gem would not be what it is today without the help of
-the following kind souls:
-
-Alexandre Rousseau
-Brian Campbell
-Henri Cook
-James Edward Gray II
-Jeremy Evans
-Joao Eriberto Mota Filho
-John Buckley
-John Nagro
-Matt Horan
-Marc (@brigriffin)
-Mike Burns (original author)
-Olle Jonsson
-Sander van der Vliet
-Stuart Yamartino
data/CHANGES
DELETED
@@ -1,68 +0,0 @@
-2018-04-23 v0.5.3
-  * release simply to add missing CHANGES notes
-
-2018-04-23 v0.5.2
-  * fixed #2 thanks to @jeremyevans
-  * added Redis as cache wrapper thanks to @brigriffin
-
-2016-09-04 v0.5.1
-  * added the ability to stop a crawl
-
-2016-05-13 v0.5.0
-  * fixed #1 thanks to @eribertomota
-  * got it running on more recent versions of ruby
-  * cleaned up the docs a bit
-  * cleaned up the licensing and attribution
-
-2009-05-21
-  * fixed an issue with robots.txt on ssl hosts
-  * fixed an issue with pulling robots.txt from disallowed hosts
-  * fixed a documentation error with ExpiredLinks
-  * Many thanks to Brian Campbell
-
-2008-10-09
-  * fixed a situation with nested slashes in urls, thanks to Sander van der Vliet and John Buckley
-
-2008-07-06
-  * Trap interrupts and shutdown gracefully
-  * Support for custom urls-to-crawl objects
-  * Example AmazonSQS urls-to-crawl support (next_urls_in_sqs.rb)
-
-2007-11-09:
-  * Handle redirects that assume a base URL.
-
-2007-11-08:
-  * Move spider_instance.rb, robot_rules.rb, and included_in_memcached.rb into
-    spider subdirectory.
-
-2007-11-02:
-  * Memcached support.
-
-2007-10-31:
-  * Add `setup' and `teardown' handlers.
-  * Can set the headers for a HTTP request.
-  * Changed :any to :every .
-  * Changed the arguments to the :every, :success, :failure, and code handler.
-
-2007-10-23:
-  * URLs without a page component but with a query component.
-  * HTTP Redirect.
-  * HTTPS.
-  * Version 0.2.1 .
-
-2007-10-22:
-  * Use RSpec to ensure that it mostly works.
-  * Use WEBrick to create a small test server for additional testing.
-  * Completely re-do the API to prepare for future expansion.
-  * Add the ability to apply each URL to a series of custom allowed?-like
-    matchers.
-  * BSD license.
-  * Version 0.2.0 .
-
-2007-03-30:
-  * Clean up the documentation.
-
-2007-03-28:
-  * Change the tail recursion to a `while' loop, to please Ruby.
-  * Documentation.
-  * Initial release: version 0.1.0 .
data/LICENSE
DELETED
@@ -1,21 +0,0 @@
-The MIT License (MIT)
-
-Copyright (c) 2007-2016 Spider Team Authors
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
data/README.md
DELETED
@@ -1,175 +0,0 @@
-
-# Spider
-_a Web spidering library for Ruby. It handles the robots.txt,
-scraping, collecting, and looping so that you can just handle the data._
-
-## Examples
-
-### Crawl the Web, loading each page in turn, until you run out of memory
-
-```ruby
-require 'spider'
-Spider.start_at('http://cashcats.biz/') {}
-```
-
-### To handle erroneous responses
-
-```ruby
-require 'spider'
-Spider.start_at('http://cashcats.biz/') do |s|
-  s.on :failure do |a_url, resp, prior_url|
-    puts "URL failed: #{a_url}"
-    puts " linked from #{prior_url}"
-  end
-end
-```
-
-### Or handle successful responses
-
-```ruby
-require 'spider'
-Spider.start_at('http://cashcats.biz/') do |s|
-  s.on :success do |a_url, resp, prior_url|
-    puts "#{a_url}: #{resp.code}"
-    puts resp.body
-    puts
-  end
-end
-```
-
-### Limit to just one domain
-
-```ruby
-require 'spider'
-Spider.start_at('http://cashcats.biz/') do |s|
-  s.add_url_check do |a_url|
-    a_url =~ %r{^http://cashcats.biz.*}
-  end
-end
-```
-
-### Pass headers to some requests
-
-```ruby
-require 'spider'
-Spider.start_at('http://cashcats.biz/') do |s|
-  s.setup do |a_url|
-    if a_url =~ %r{^http://.*wikipedia.*}
-      headers['User-Agent'] = "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
-    end
-  end
-end
-```
-
-### Use memcached to track cycles
-
-```ruby
-require 'spider'
-require 'spider/included_in_memcached'
-SERVERS = ['10.0.10.2:11211','10.0.10.3:11211','10.0.10.4:11211']
-Spider.start_at('http://cashcats.biz/') do |s|
-  s.check_already_seen_with IncludedInMemcached.new(SERVERS)
-end
-```
-
-### Use Redis to track cycles
-
-```ruby
-require 'spider'
-require 'spider/included_in_redis'
-Spider.start_at('http://cashcats.biz/') do |s|
-  s.check_already_seen_with IncludedInRedis.new(host: '127.0.0.1', port: 6379)
-end
-```
-
-### Use Plain text to track cycles
-
-```ruby
-require 'spider'
-require 'spider/included_in_redis'
-Spider.start_at('http://cashcats.biz/') do |s|
-  s.check_already_seen_with IncludedInFile.new('/tmp/cashcats_crawl.txt')
-end
-```
-
-### Track cycles with a custom object
-
-```ruby
-require 'spider'
-class ExpireLinks < Hash
-  def <<(v)
-    self[v] = Time.now
-  end
-  def include?(v)
-    self[v].kind_of?(Time) && (self[v] + 86400) >= Time.now
-  end
-end
-
-Spider.start_at('http://cashcats.biz/') do |s|
-  s.check_already_seen_with ExpireLinks.new
-end
-```
-
-### Store nodes to visit with Amazon SQS
-
-```ruby
-require 'spider'
-require 'spider/next_urls_in_sqs'
-Spider.start_at('http://cashcats.biz') do |s|
-  s.store_next_urls_with NextUrlsInSQS.new(AWS_ACCESS_KEY, AWS_SECRET_ACCESS_KEY)
-end
-```
-
-### Store nodes to visit with a custom object
-
-```ruby
-require 'spider'
-class MyArray < Array
-  def pop
-    super
-  end
-
-  def push(a_msg)
-    super(a_msg)
-  end
-end
-
-Spider.start_at('http://cashcats.biz') do |s|
-  s.store_next_urls_with MyArray.new
-end
-```
-
-### Create a URL graph
-
-```ruby
-require 'spider'
-nodes = {}
-Spider.start_at('http://cashcats.biz/') do |s|
-  s.add_url_check {|a_url| a_url =~ %r{^http://cashcats.biz.*} }
-
-  s.on(:every) do |a_url, resp, prior_url|
-    nodes[prior_url] ||= []
-    nodes[prior_url] << a_url
-  end
-end
-```
-
-### Use a proxy
-
-```ruby
-require 'net/http_configuration'
-require 'spider'
-http_conf = Net::HTTP::Configuration.new(:proxy_host => '7proxies.org',
-                                         :proxy_port => 8881)
-http_conf.apply do
-  Spider.start_at('http://img.4chan.org/b/') do |s|
-    s.on(:success) do |a_url, resp, prior_url|
-      File.open(a_url.gsub('/',':'),'w') do |f|
-        f.write(resp.body)
-      end
-    end
-  end
-end
-```
-
-_Copyright (c) 2007-2016 Spider Team Authors_
data/spec/spec_helper.rb
DELETED
@@ -1,90 +0,0 @@
-require 'rubygems'
-require 'webrick'
-require 'spec'
-
-Spec::Runner.configure { |c| c.mock_with :mocha }
-
-def local_require(*files)
-  files.each do |file|
-    require File.dirname(__FILE__)+'/../lib/'+file
-  end
-end
-
-class BeStaticServerPages
-  def initialize
-    @pages = ['http://localhost:8888/', 'http://localhost:8888/foo']
-    @actual = nil
-  end
-
-  attr :actual, true
-
-  def matches?(actual)
-    @actual = actual
-    actual == @pages
-  end
-
-  def failure_message
-    "expected #{@pages.inspect}, got #{@actual.inspect}"
-  end
-
-  def description
-    "be the pages returned by the static server (#{@pages.inspect})"
-  end
-end
-
-def with_web_server(svlt)
-  server = WEBrick::HTTPServer.new(:Port => 8888, :Logger => null_logger,
-                                   :AccessLog => [])
-  server.mount('/', svlt)
-  Thread.new {server.start}
-  begin
-    yield
-  ensure
-    server.shutdown
-  end
-end
-
-def with_memcached
-  system('memcached -d -P /tmp/spider-memcached.pid')
-  cacher = IncludedInMemcached.new('localhost:11211')
-  begin
-    yield
-  ensure
-    system('kill -KILL `cat /tmp/spider-memcached.pid`')
-  end
-end
-
-def be_static_server_pages
-  BeStaticServerPages.new
-end
-
-class QueryServlet < WEBrick::HTTPServlet::AbstractServlet
-  def do_GET(req, res)
-    res['Content-type'] = 'text/plain'
-    res.body = "response\n"
-  end
-end
-
-class LoopingServlet < WEBrick::HTTPServlet::AbstractServlet
-  def do_GET(req, res)
-    res['Content-type'] = 'text/html'
-    if req.path == '/foo'
-      res.body = <<-END
-        <a href="/">a</a>
-      END
-    else
-      res.body = <<-END
-        <a href="/foo">b</a>
-      END
-    end
-  end
-end
-
-def null_logger
-  l = stub
-  [:log, :fatal, :error, :warn , :info, :debug].each do |k|
-    l.stubs(k)
-    l.stubs("#{k}?".to_sym)
-  end
-  l
-end
data/spec/spider/included_in_memcached_spec.rb
DELETED
@@ -1,43 +0,0 @@
-require File.dirname(__FILE__)+'/../spec_helper'
-
-def before_specing_memcached
-  local_require 'spider/included_in_memcached'
-  system('memcached -d -P /tmp/spider-memcached.pid')
-end
-
-def after_specing_memcached
-  system('kill -KILL `cat /tmp/spider-memcached.pid`')
-end
-
-Spec::Runner.configure { |c| c.mock_with :mocha }
-
-describe 'Object to halt cycles' do
-  before do
-    before_specing_memcached
-  end
-
-  it 'should understand <<' do
-    c = IncludedInMemcached.new('localhost:11211')
-    c.should respond_to(:<<)
-  end
-
-  it 'should understand included?' do
-    c = IncludedInMemcached.new('localhost:11211')
-    c.should respond_to(:include?)
-  end
-
-  it 'should produce false if the object is not included' do
-    c = IncludedInMemcached.new('localhost:11211')
-    c.include?('a').should be_false
-  end
-
-  it 'should produce true if the object is included' do
-    c = IncludedInMemcached.new('localhost:11211')
-    c << 'a'
-    c.include?('a').should be_true
-  end
-
-  after do
-    after_specing_memcached
-  end
-end
data/spec/spider/included_in_redis_spec.rb
DELETED
@@ -1,43 +0,0 @@
-require File.dirname(__FILE__)+'/../spec_helper'
-
-def before_specing_redis
-  local_require 'spider/included_in_redis'
-  system('redis-server 127.0.0.1:6379')
-end
-
-def after_specing_redis
-  system('kill -KILL `pidof redis-server`')
-end
-
-Spec::Runner.configure { |c| c.mock_with :mocha }
-
-describe 'Object to halt cycles' do
-  before do
-    before_specing_redis
-  end
-
-  it 'should understand <<' do
-    c = IncludedInRedis.new(host: 'localhost', port: 6379)
-    c.should respond_to(:<<)
-  end
-
-  it 'should understand included?' do
-    c = IncludedInRedis.new(host: 'localhost', port: 6379)
-    c.should respond_to(:include?)
-  end
-
-  it 'should produce false if the object is not included' do
-    c = IncludedInRedis.new(host: 'localhost', port: 6379)
-    c.include?('a').should be_false
-  end
-
-  it 'should produce true if the object is included' do
-    c = IncludedInRedis.new(host: 'localhost', port: 6379)
-    c << 'a'
-    c.include?('a').should be_true
-  end
-
-  after do
-    after_specing_redis
-  end
-end
data/spec/spider/spider_instance_spec.rb
DELETED
@@ -1,405 +0,0 @@
-require File.dirname(__FILE__)+'/../spec_helper'
-require 'webrick'
-require 'webrick/https'
-local_require 'spider', 'spider/included_in_memcached'
-
-describe 'SpiderInstance' do
-  # http://www.rcuk.ac.uk/ redirects to /default.htm, which isn't a complete
-  # URL. Bug reported by Henri Cook.
-  it 'should construct a complete redirect URL' do
-    @response_called = false
-    redirected_resp = stub(:redirect? => true,
-                           :[] => '/default.htm')
-    success_resp = stub(:redirect? => false)
-    http_req = stub(:request => true)
-    http_mock_redir = stub(:use_ssl= => true)
-    http_mock_redir.stubs(:start).yields(http_req).returns(redirected_resp)
-    http_mock_success = stub(:use_ssl= => true)
-    http_mock_success.stubs(:start).yields(http_req).returns(success_resp)
-    Net::HTTP.expects(:new).times(2).returns(http_mock_redir).then.
-      returns(http_mock_success)
-    si = SpiderInstance.new({nil => ['http://www.rcuk.ac.uk/']})
-    si.get_page(URI.parse('http://www.rcuk.ac.uk/')) do |resp|
-      @response_called = true
-    end
-    @response_called.should be_true
-  end
-
-  it 'should prevent cycles with an IncludedInMemcached' do
-    with_memcached do
-      cacher = IncludedInMemcached.new('localhost:11211')
-      it_should_prevent_cycles_with(cacher)
-    end
-  end
-
-  it 'should prevent cycles with an Array' do
-    cacher = Array.new
-    it_should_prevent_cycles_with(cacher)
-  end
-
-  it 'should call the "setup" callback before loading the Web page' do
-    mock_successful_http
-    @on_called = false
-    @before_called = false
-    si = SpiderInstance.new({nil => ['http://example.com/']})
-    si.stubs(:allowed?).returns(true)
-    si.stubs(:generate_next_urls).returns([])
-    si.setup { |*a| @before_called = Time.now }
-    si.on(:every) { |*a| @on_called = Time.now }
-    si.start!
-    @on_called.should_not be_false
-    @before_called.should_not be_false
-    @before_called.should_not be_false
-    @before_called.should < @on_called
-  end
-
-  it 'should call the "teardown" callback after running all other callbacks' do
-    mock_successful_http
-    @on_called = false
-    @after_called = false
-    si = SpiderInstance.new({nil => ['http://example.com/']})
-    si.stubs(:allowed?).returns(true)
-    si.stubs(:generate_next_urls).returns([])
-    si.on(:every) { |*a| @on_called = Time.now }
-    si.teardown { |*a| @after_called = Time.now }
-    si.start!
-    @on_called.should_not be_false
-    @after_called.should_not be_false
-    @after_called.should_not be_false
-    @after_called.should > @on_called
-  end
-
-  it 'should pass headers set by a setup handler to the HTTP request' do
-    mock_successful_http
-    Net::HTTP::Get.expects(:new).with('/foo',{'X-Header-Set' => 'True'})
-    si = SpiderInstance.new(nil => ['http://example.com/foo'])
-    si.stubs(:allowable_url?).returns(true)
-    si.stubs(:generate_next_urls).returns([])
-    si.setup do |a_url|
-      si.headers['X-Header-Set'] = 'True'
-    end
-    si.teardown do |a_url|
-      si.clear_headers
-    end
-    si.start!
-  end
-
-  it 'should call the :every callback with the current URL, the response, and the prior URL' do
-    mock_successful_http
-    callback_arguments_on(:every)
-  end
-
-  it 'should call the :success callback with the current URL, the request, and the prior URL' do
-    mock_successful_http
-    callback_arguments_on(:success)
-  end
-
-  it 'should call the :failure callback with the current URL, the request, and the prior URL' do
-    mock_failed_http
-    callback_arguments_on(:failure)
-  end
-
-  it 'should call the HTTP status error code callback with the current URL, the request, and the prior URL' do
-    mock_failed_http
-    callback_arguments_on(404)
-  end
-
-  it 'should call the HTTP status success code callback with the current URL, the request, and the prior URL' do
-    mock_successful_http
-    callback_arguments_on(200)
-  end
-
-  # Bug reported by John Nagro, using the example source http://eons.com/
-  # had to change line 192; uses request_uri now instead of path.
-  it 'should handle query URLs without a path' do
-    u = 'http://localhost:8888?s=1'
-    u_p = URI.parse(u)
-    @block_called = false
-    with_web_server(QueryServlet) do
-      si = SpiderInstance.new({nil => [u]})
-      si.get_page(u_p) do
-        @block_called = true
-      end
-    end
-    @block_called.should be_true
-  end
-
-  # This solves a problem reported by John Nagro.
-  it 'should handle redirects' do
-    u = 'http://example.com/'
-    u_p = URI.parse(u)
-    @redirect_handled = false
-    mock_redirect_http
-    si = SpiderInstance.new({nil => [u]})
-    si.get_page(u_p) do
-      @redirect_handled = true
-    end
-    @redirect_handled.should be_true
-  end
-
-  it 'should handle HTTPS' do
-    u = 'https://localhost:10443/'
-    u_p = URI.parse(u)
-    @page_called = false
-    server = WEBrick::HTTPServer.new(:Port => 10443,
-                                     :Logger => null_logger,
-                                     :AccessLog => [],
-                                     :SSLEnable => true,
-                                     :SSLCertName => [["O", "ruby-lang.org"], ["OU", "sample"], ["CN", WEBrick::Utils::getservername]],
-                                     :SSLComment => 'Comment of some sort')
-    server.mount('/', QueryServlet)
-    Thread.new {server.start}
-    si = SpiderInstance.new({nil => [u]})
-    si.get_page(u_p) { @page_called = true }
-    server.shutdown
-    @page_called.should be_true
-  end
-
-  it 'should skip URLs when allowable_url? is false' do
-    u = 'http://example.com/'
-    u_p = URI.parse(u)
-    http_resp = stub(:redirect? => false, :success? => true, :code => 200, :headers => 1, :body => 1)
-    Net::HTTP.stubs(:new).returns(stub(:request => http_resp, :finish => nil))
-    si = SpiderInstance.new({nil => [u]})
-    si.expects(:allowable_url?).with(u, u_p).returns(false)
-    si.expects(:get_page).times(0)
-    si.start!
-  end
-
-  it 'should not skip URLs when allowable_url? is true' do
-    u = 'http://example.com/'
-    u_p = URI.parse(u)
-    http_resp = stub(:redirect? => false, :success? => true, :code => 200, :headers => 1, :body => 1)
-    Net::HTTP.stubs(:new).returns(stub(:request => http_resp, :finish => nil))
-    si = SpiderInstance.new({nil => [u]})
-    si.expects(:allowable_url?).with(u, u_p).returns(true)
-    si.expects(:get_page).with(URI.parse(u))
-    si.start!
-  end
-
-  it 'should disallow URLs when the robots.txt says to' do
-    robot_rules = stub
-    SpiderInstance.any_instance.expects(:open).
-      with('http://example.com:80/robots.txt', 'User-Agent' => 'Ruby Spider',
-           'Accept' => 'text/html,text/xml,application/xml,text/plain').
-      yields(stub(:read => 'robots.txt content'))
-    robot_rules.expects(:parse).with('http://example.com:80/robots.txt',
-                                     'robots.txt content')
-    robot_rules.expects(:allowed?).with('http://example.com/').returns(false)
-    si = SpiderInstance.new({nil => ['http://example.com/']}, [], robot_rules, [])
-    allowable = si.allowable_url?('http://example.com/',
-                                  URI.parse('http://example.com/'))
-    allowable.should be_false
-  end
-
-  it 'should disallow URLs when they fail any url_check' do
-    si = SpiderInstance.new({nil => ['http://example.com/']})
-    si.stubs(:allowed?).returns(true)
-    si.add_url_check { |a_url| false }
-    allowable = si.allowable_url?('http://example.com/',
-                                  URI.parse('http://example.com/'))
-    allowable.should be_false
-  end
-
-  it 'should support multiple url_checks' do
-    @first_url_check = false
-    @second_url_check = false
-    si = SpiderInstance.new({nil => ['http://example.com/']})
-    si.stubs(:allowed?).returns(true)
-    si.add_url_check do |a_url|
-      @first_url_check = true
-      true
-    end
-    si.add_url_check do |a_url|
-      @second_url_check = true
-      false
-    end
-    allowable = si.allowable_url?('http://example.com/',
-                                  URI.parse('http://example.com/'))
-    allowable.should be_false
-    @first_url_check.should be_true
-    @second_url_check.should be_true
-  end
-
-  it 'should avoid cycles' do
-    u = 'http://example.com/'
-    u_p = URI.parse(u)
-    si = SpiderInstance.new({nil => [u]}, [u_p])
-    si.stubs(:allowed?).returns(true)
-    allowable = si.allowable_url?(u, u_p)
-    allowable.should be_false
-    u_p.should_not be_nil
-  end
-
-  it 'should call the 404 handler for 404s' do
-    @proc_called = false
-    mock_failed_http
-    si = SpiderInstance.new({nil => ['http://example.com/']})
-    si.stubs(:allowed?).returns(true)
-    si.stubs(:generate_next_urls).returns([])
-    si.on(404) {|*a| @proc_called = true}
-    si.start!
-    @proc_called.should be_true
-  end
-
-  it 'should call the :success handler on success' do
-    @proc_called = false
-    mock_successful_http
-    si = SpiderInstance.new({nil => ['http://example.com/']})
-    si.stubs(:allowed?).returns(true)
-    si.stubs(:generate_next_urls).returns([])
-    si.on(:success) {|*a| @proc_called = true}
-    si.start!
-    @proc_called.should be_true
-  end
-
-  it 'should not call the :success handler on failure' do
-    @proc_called = false
-    mock_failed_http
-    si = SpiderInstance.new({nil => ['http://example.com/']})
-    si.stubs(:allowed?).returns(true)
-    si.stubs(:generate_next_urls).returns([])
-    si.on(:success) {|*a| @proc_called = true}
-    si.start!
-    @proc_called.should be_false
-  end
-
-  it 'should call the :success handler and the 200 handler on 200' do
-    @proc_200_called = false
-    @proc_success_called = false
-    mock_successful_http
-    si = SpiderInstance.new({nil => ['http://example.com/']})
-    si.stubs(:allowed?).returns(true)
-    si.stubs(:generate_next_urls).returns([])
-    si.on(:success) {|*a| @proc_success_called = true}
-    si.on(200) {|*a| @proc_200_called = true}
-    si.start!
-    @proc_200_called.should be_true
-    @proc_success_called.should be_true
-  end
-
-  it 'should not call the :failure handler on success' do
-    @proc_called = false
-    mock_successful_http
-    si = SpiderInstance.new({nil => ['http://example.com/']})
-    si.stubs(:allowed?).returns(true)
-    si.stubs(:generate_next_urls).returns([])
-    si.on(:failure) {|*a| @proc_called = true}
-    si.start!
-    @proc_called.should be_false
-  end
-
-  it 'should call the :failure handler on failure' do
-    @proc_called = false
-    mock_failed_http
-    si = SpiderInstance.new({nil => ['http://example.com/']})
-    si.stubs(:allowed?).returns(true)
-    si.stubs(:generate_next_urls).returns([])
-    si.on(:failure) {|*a| @proc_called = true}
-    si.start!
-    @proc_called.should be_true
-  end
-
-  it 'should call the :failure handler and the 404 handler on 404' do
-    @proc_404_called = false
-    @proc_failure_called = false
-    mock_failed_http
-    si = SpiderInstance.new({nil => ['http://example.com/']})
-    si.stubs(:allowed?).returns(true)
-    si.stubs(:generate_next_urls).returns([])
-    si.on(:failure) {|*a| @proc_failure_called = true}
-    si.on(404) {|*a| @proc_404_called = true}
-    si.start!
-    @proc_404_called.should be_true
-    @proc_failure_called.should be_true
-  end
-
-  it 'should call the :every handler even when a handler for the error code is defined' do
-    @any_called = false
-    mock_successful_http
-    si = SpiderInstance.new({nil => ['http://example.com/']})
-    si.stubs(:allowed?).returns(true)
-    si.stubs(:generate_next_urls).returns([])
-    si.on(:every) { |*a| @any_called = true }
-    si.on(202) {|*a|}
-    si.start!
-    @any_called.should be_true
-  end
-
-  it 'should support a block as a response handler' do
-    @proc_called = false
-    mock_successful_http
-    si = SpiderInstance.new({nil => ['http://example.com/']})
-    si.stubs(:allowed?).returns(true)
-    si.stubs(:generate_next_urls).returns([])
-    si.on(:every) { |*a| @proc_called = true }
-    si.start!
-    @proc_called.should be_true
-  end
-
-  it 'should support a proc as a response handler' do
-    @proc_called = false
-    mock_successful_http
-    si = SpiderInstance.new({nil => ['http://example.com/']})
-    si.stubs(:allowed?).returns(true)
-    si.stubs(:generate_next_urls).returns([])
-    si.on(:every, Proc.new { |*a| @proc_called = true })
-    si.start!
-    @proc_called.should be_true
-  end
-
-  def mock_http(http_req)
-    http_obj = mock(:use_ssl= => true)
-    http_obj.expects(:start).
-      yields(mock(:request => http_req)).returns(http_req)
-    Net::HTTP.expects(:new).returns(http_obj)
-  end
-
-  def mock_successful_http
-    http_req = stub(:redirect? => false, :success? => true, :code => 200, :body => 'body')
-    mock_http(http_req)
-  end
-
-  def mock_failed_http
-    http_req = stub(:redirect? => false, :success? => false, :code => 404)
-    mock_http(http_req)
-  end
-
-  def mock_redirect_http
-    http_req = stub(:redirect? => true, :success? => false, :code => 404)
-    http_req.expects(:[]).with('Location').returns('http://example.com/')
-    http_req2 = stub(:redirect? => false, :success? => true, :code => 200)
-    http_obj = mock(:use_ssl= => true)
-    http_obj.expects(:start).
-      yields(mock(:request => http_req)).returns(http_req)
-    http_obj2 = mock(:use_ssl= => true)
-    http_obj2.expects(:start).
-      yields(mock(:request => http_req2)).returns(http_req2)
-    Net::HTTP.expects(:new).times(2).returns(http_obj).then.returns(http_obj2)
-  end
-
-  def callback_arguments_on(code)
-    si = SpiderInstance.new('http://foo.com/' => ['http://example.com/'])
-    si.stubs(:allowed?).returns(true)
-    si.stubs(:generate_next_urls).returns([])
-    si.on(code) do |a_url, resp, prior_url|
-      a_url.should == 'http://example.com/'
-      resp.should_not be_nil
-      prior_url.should == 'http://foo.com/'
-    end
-    si.start!
-  end
-
-  def it_should_prevent_cycles_with(cacher)
-    u = 'http://localhost:8888/'
-    u_p = URI.parse(u)
-    u2 = 'http://localhost:8888/foo'
-    u_p2 = URI.parse(u2)
-
-    with_web_server(LoopingServlet) do
-      si = SpiderInstance.new(nil => [u])
-      si.check_already_seen_with cacher
-      si.start!
-    end
-  end
-end
data/spec/spider_spec.rb
DELETED
@@ -1,33 +0,0 @@
-require File.dirname(__FILE__)+'/spec_helper'
-local_require 'spider', 'spider/included_in_memcached'
-
-describe 'Spider' do
-  it 'should find two pages without cycles using defaults' do
-    u = []
-    with_web_server(LoopingServlet) do
-      u = find_pages_with_static_server
-    end
-    u.should be_static_server_pages
-  end
-
-  it 'should find two pages without cycles using memcached' do
-    u = []
-    with_web_server(LoopingServlet) do
-      with_memcached do
-        u = find_pages_with_static_server do |s|
-          s.check_already_seen_with IncludedInMemcached.new('localhost:11211')
-        end
-      end
-    end
-    u.should be_static_server_pages
-  end
-
-  def find_pages_with_static_server(&block)
-    pages = []
-    Spider.start_at('http://localhost:8888/') do |s|
-      block.call(s) unless block.nil?
-      s.on(:every){ |u,r,p| pages << u }
-    end
-    pages
-  end
-end
data/spider.gemspec
DELETED
@@ -1,20 +0,0 @@
-require 'rubygems'
-
-require File.expand_path('../lib/spider', __FILE__)
-
-spec = Gem::Specification.new do |s|
-  s.author = 'John Nagro'
-  s.email = 'john.nagro@gmail.com'
-  s.license = 'MIT'
-  s.has_rdoc = true
-  s.homepage = 'https://github.com/johnnagro/spider'
-  s.name = 'spider'
-  s.summary = 'A Web spidering library'
-  s.files = Dir['**/*'].delete_if { |f| f =~ /(cvs|gem|svn)$/i }
-  s.require_path = 'lib'
-  s.description = <<-EOF
-A Web spidering library: handles robots.txt, scraping, finding more
-links, and doing it all over again.
-  EOF
-  s.version = Spider::VERSION
-end