spider 0.5.3 → 0.6.0

This diff shows the contents of publicly available package versions as released to one of the supported registries. It is provided for informational purposes and reflects the changes between the two versions exactly as they appear in the public registry.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
-SHA1:
-  metadata.gz: 60310268052e7e2cfe120fdd313e314b28124c5a
-  data.tar.gz: 8747d828232c89f674e765e005df722c170a7839
+SHA256:
+  metadata.gz: fa130209d8ff28f2cfb7c42ec1915ca39b5a0895e568882134a59b67757c432d
+  data.tar.gz: 424d6d26fd66353515cbc7407fb0c5554a3566f19da0df526940c97025f4ef2c
 SHA512:
-  metadata.gz: a58cc23eb4d464b41957ca39ac17a3d2c91d86ac71140163b2072bedc317e3b529430fd47630980c0e87668df509031078a6d0a7d63d62878d5a31300b34bc51
-  data.tar.gz: a2391c2b8e64dd4dd5f895e9530312bd515d00981c85d857ff8261d28bdaab2a1669e1d9fc9e15fa43f62f4eba99666465cbc0bdd6378e6610daa8e043e5f156
+  metadata.gz: 686956d9960f6445e6f85dcf471b941154d232c7f9a0ff7c13008aed94f6a3a3a328e41abcbcbfa24bbedb5fe04bc481ec9ac31cbabf82806e5b7cb447075f79
+  data.tar.gz: 94ea96cb7b1cb71777d4b74a42d868d853b4adbb08185be2035d6658e1e027374ce3d4cc75109dfea49ba8fb95b44d7ae5948f8d4b03a1027c780eac4023e29f
data/VERSION ADDED
@@ -0,0 +1 @@
+0.6.0
data/lib/spider/included_in_file.rb ADDED
@@ -0,0 +1,32 @@
+# Use a plain text file to track cycles.
+
+# A specialized class using a plain text file to track items stored. It supports
+# three operations: new, <<, and include? . Together these can be used to
+# add items to the text file, then determine whether an item has been added.
+#
+# To use it with Spider, use the check_already_seen_with method:
+#
+#   Spider.start_at('http://example.com/') do |s|
+#     s.check_already_seen_with IncludedInFile.new('/tmp/crawled.log')
+#   end
+class IncludedInFile
+  # Construct a new IncludedInFile instance.
+  # @param filepath [String] path of the file in which to store crawled URLs
+  def initialize(filepath)
+    @filepath = filepath
+    # Create the file if it does not already exist.
+    File.write(@filepath, '') unless File.file?(@filepath)
+    @urls = File.readlines(@filepath).map(&:chomp)
+  end
+
+  # Add an item to the file and to the in-memory array of URLs.
+  def <<(v)
+    @urls << v.to_s
+    File.write(@filepath, "#{v}\r\n", File.size(@filepath), mode: 'a')
+  end
+
+  # True if the item is in the file.
+  def include?(v)
+    @urls.include? v.to_s
+  end
+end
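Unlike the default in-memory tracking, the log file written by IncludedInFile survives restarts, so a re-run of the same script skips every URL already recorded. A minimal sketch (the require path follows the gem's convention for the other trackers; the log path is illustrative):

```ruby
require 'spider'
require 'spider/included_in_file'

# Re-running this script resumes the crawl: URLs already listed in
# /tmp/crawled.log are treated as seen and are not fetched again.
Spider.start_at('http://example.com/') do |s|
  s.check_already_seen_with IncludedInFile.new('/tmp/crawled.log')
end
```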
data/lib/spider/included_in_memcached.rb CHANGED
@@ -1,6 +1,6 @@
 # Use memcached to track cycles.
 
-require 'memcache'
+require 'dalli'
 
 # A specialized class using memcached to track items stored. It supports
 # three operations: new, <<, and include? . Together these can be used to
@@ -12,10 +12,11 @@ require 'memcache'
 #     s.check_already_seen_with IncludedInMemcached.new('localhost:11211')
 #   end
 class IncludedInMemcached
-  # Construct a new IncludedInMemcached instance. All arguments here are
-  # passed to MemCache (part of the memcache-client gem).
-  def initialize(*a)
-    @c = MemCache.new(*a)
+  # Construct a new IncludedInMemcached instance. The first argument should be
+  # the memcached server address (e.g., 'localhost:11211'). Additional options
+  # can be passed as a hash (see Dalli::Client documentation).
+  def initialize(server, options = {})
+    @c = Dalli::Client.new(server, options)
   end
 
   # Add an item to the memcache.
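Because the constructor now forwards its options hash to Dalli::Client, standard Dalli settings such as a key namespace or a TTL can be applied to the cycle-tracking entries. A sketch (server address and option values are illustrative):

```ruby
require 'spider'
require 'spider/included_in_memcached'

# namespace and expires_in are standard Dalli::Client options; here they
# isolate the spider's keys and let seen-URL entries expire after a day.
tracker = IncludedInMemcached.new('localhost:11211',
                                  namespace: 'spider',
                                  expires_in: 86_400)

Spider.start_at('http://example.com/') do |s|
  s.check_already_seen_with tracker
end
```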
data/lib/spider/spider_instance.rb CHANGED
@@ -165,7 +165,6 @@ class SpiderInstance
     trap("SIGINT") { @interrupted = true }
     begin
       next_urls = @next_urls.pop
-      tmp_n_u = {}
       next_urls.each do |prior_url, urls|
         urls = [urls] unless urls.kind_of?(Array)
         urls.map do |a_url|
@@ -176,12 +175,9 @@ class SpiderInstance
           @setup.call(a_url) unless @setup.nil?
           get_page(parsed_url) do |response|
             do_callbacks(a_url, response, prior_url)
-            #tmp_n_u[a_url] = generate_next_urls(a_url, response)
-            #@next_urls.push tmp_n_u
             generate_next_urls(a_url, response).each do |a_next_url|
               @next_urls.push a_url => a_next_url
             end
-            #exit if interrupted
           end
           @teardown.call(a_url) unless @teardown.nil?
           break if @interrupted
@@ -256,7 +252,7 @@ class SpiderInstance
   def do_callbacks(a_url, resp, prior_url) #:nodoc:
     cbs = [@callbacks[:every],
            resp.success? ? @callbacks[:success] : @callbacks[:failure],
-           @callbacks[resp.code]]
+           @callbacks[resp.code.to_i]]
 
     cbs.each do |cb|
       cb.call(a_url, resp, prior_url) if cb
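The `to_i` matters because Net::HTTPResponse#code returns the status as a String ("404"), while handlers registered with `on` use Integer keys, so numeric status handlers would not match a live response before this change. With the fix, a handler like the following fires as expected (URL illustrative):

```ruby
Spider.start_at('http://example.com/') do |s|
  # Registered under the Integer key 404; now matched against resp.code.to_i.
  s.on(404) do |a_url, resp, prior_url|
    puts "Not found: #{a_url} (linked from #{prior_url})"
  end
end
```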
@@ -287,14 +283,16 @@ class SpiderInstance
       case parsed_additional_url.scheme
       when nil
         u = base_url.is_a?(URI) ? base_url : URI.parse(base_url)
+        # Include port if it's not the default port
+        port_part = (u.port && ((u.scheme == 'http' && u.port != 80) || (u.scheme == 'https' && u.port != 443))) ? ":#{u.port}" : ""
         if additional_url[0].chr == '/'
-          "#{u.scheme}://#{u.host}#{additional_url}"
+          "#{u.scheme}://#{u.host}#{port_part}#{additional_url}"
         elsif u.path.nil? || u.path == ''
-          "#{u.scheme}://#{u.host}/#{additional_url}"
+          "#{u.scheme}://#{u.host}#{port_part}/#{additional_url}"
         elsif u.path[0].chr == '/'
-          "#{u.scheme}://#{u.host}#{u.path}/#{additional_url}"
+          "#{u.scheme}://#{u.host}#{port_part}#{u.path}/#{additional_url}"
         else
-          "#{u.scheme}://#{u.host}/#{u.path}/#{additional_url}"
+          "#{u.scheme}://#{u.host}#{port_part}/#{u.path}/#{additional_url}"
         end
       else
         additional_url
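The practical effect of `port_part`: relative links found on a host served from a non-default port now resolve back to that same port instead of silently dropping it. A standalone sketch of the new logic (not the gem's public API; URLs illustrative):

```ruby
require 'uri'

# Mirrors the port_part branch added above, for absolute-path links.
def absolutize(base_url, additional_url)
  u = URI.parse(base_url)
  port_part = (u.port && ((u.scheme == 'http' && u.port != 80) ||
                          (u.scheme == 'https' && u.port != 443))) ? ":#{u.port}" : ""
  "#{u.scheme}://#{u.host}#{port_part}#{additional_url}"
end

absolutize('http://example.com:8080/index.html', '/about')
# => "http://example.com:8080/about"  (0.5.3 dropped the :8080)
absolutize('http://example.com/index.html', '/about')
# => "http://example.com/about"       (default ports are still omitted)
```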
data/lib/spider.rb CHANGED
@@ -4,8 +4,9 @@ require File.dirname(__FILE__)+'/spider/spider_instance'
 # links, and doing it all over again.
 class Spider
 
-  VERSION_INFO = [0, 5, 3] unless defined?(self::VERSION_INFO)
-  VERSION = VERSION_INFO.map(&:to_s).join('.') unless defined?(self::VERSION)
+  VERSION = File.read(
+    File.expand_path('../VERSION', __dir__)
+  ).strip.freeze
 
   def self.version
     VERSION
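With the version now read from the packaged VERSION file, a release bump only touches that one file; the public accessors are unchanged:

```ruby
require 'spider'

Spider.version   # => "0.6.0"
Spider::VERSION  # => "0.6.0"
```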
metadata CHANGED
@@ -1,14 +1,13 @@
 --- !ruby/object:Gem::Specification
 name: spider
 version: !ruby/object:Gem::Version
-  version: 0.5.3
+  version: 0.6.0
 platform: ruby
 authors:
 - John Nagro
-autorequire:
 bindir: bin
 cert_chain: []
-date: 2018-04-23 00:00:00.000000000 Z
+date: 1980-01-02 00:00:00.000000000 Z
 dependencies: []
 description: |
   A Web spidering library: handles robots.txt, scraping, finding more
@@ -18,27 +17,20 @@ executables: []
 extensions: []
 extra_rdoc_files: []
 files:
-- AUTHORS
-- CHANGES
-- LICENSE
-- README.md
+- VERSION
 - lib/spider.rb
+- lib/spider/included_in_file.rb
 - lib/spider/included_in_memcached.rb
 - lib/spider/included_in_redis.rb
 - lib/spider/next_urls_in_sqs.rb
 - lib/spider/robot_rules.rb
 - lib/spider/spider_instance.rb
-- spec/spec_helper.rb
-- spec/spider/included_in_memcached_spec.rb
-- spec/spider/included_in_redis_spec.rb
-- spec/spider/spider_instance_spec.rb
-- spec/spider_spec.rb
-- spider.gemspec
 homepage: https://github.com/johnnagro/spider
 licenses:
 - MIT
-metadata: {}
-post_install_message:
+metadata:
+  source_code_uri: https://github.com/johnnagro/spider
+  changelog_uri: https://github.com/johnnagro/spider/blob/main/CHANGELOG.md
 rdoc_options: []
 require_paths:
 - lib
@@ -46,16 +38,14 @@ required_ruby_version: !ruby/object:Gem::Requirement
   requirements:
   - - ">="
     - !ruby/object:Gem::Version
-      version: '0'
+      version: '2.5'
 required_rubygems_version: !ruby/object:Gem::Requirement
   requirements:
   - - ">="
     - !ruby/object:Gem::Version
       version: '0'
 requirements: []
-rubyforge_project: spider
-rubygems_version: 2.5.2.1
-signing_key:
+rubygems_version: 3.6.9
 specification_version: 4
 summary: A Web spidering library
 test_files: []
data/AUTHORS DELETED
@@ -1,14 +0,0 @@
-The Ruby Spider Gem would not be what it is today without the help of
-the following kind souls:
-
-Brian Campbell
-Henri Cook
-James Edward Gray II
-Jeremy Evans
-Joao Eriberto Mota Filho
-John Buckley
-John Nagro
-Matt Horan
-Marc (@brigriffin)
-Mike Burns (original author)
-Sander van der Vliet
data/CHANGES DELETED
@@ -1,68 +0,0 @@
-2018-04-23 v0.5.3
-  * release simply to add missing CHANGES notes
-
-2018-04-23 v0.5.2
-  * fixed #2 thanks to @jeremyevans
-  * added Redis as cache wrapper thanks to @brigriffin
-
-2016-09-04 v0.5.1
-  * added the ability to stop a crawl
-
-2016-05-13 v0.5.0
-  * fixed #1 thanks to @eribertomota
-  * got it running on more recent versions of ruby
-  * cleaned up the docs a bit
-  * cleaned up the licensing and attribution
-
-2009-05-21
-  * fixed an issue with robots.txt on ssl hosts
-  * fixed an issue with pulling robots.txt from disallowed hosts
-  * fixed a documentation error with ExpiredLinks
-  * Many thanks to Brian Campbell
-
-2008-10-09
-  * fixed a situation with nested slashes in urls, thanks to Sander van der Vliet and John Buckley
-
-2008-07-06
-  * Trap interrupts and shutdown gracefully
-  * Support for custom urls-to-crawl objects
-  * Example AmazonSQS urls-to-crawl support (next_urls_in_sqs.rb)
-
-2007-11-09:
-  * Handle redirects that assume a base URL.
-
-2007-11-08:
-  * Move spider_instance.rb, robot_rules.rb, and included_in_memcached.rb into
-    spider subdirectory.
-
-2007-11-02:
-  * Memcached support.
-
-2007-10-31:
-  * Add `setup' and `teardown' handlers.
-  * Can set the headers for a HTTP request.
-  * Changed :any to :every .
-  * Changed the arguments to the :every, :success, :failure, and code handler.
-
-2007-10-23:
-  * URLs without a page component but with a query component.
-  * HTTP Redirect.
-  * HTTPS.
-  * Version 0.2.1 .
-
-2007-10-22:
-  * Use RSpec to ensure that it mostly works.
-  * Use WEBrick to create a small test server for additional testing.
-  * Completely re-do the API to prepare for future expansion.
-  * Add the ability to apply each URL to a series of custom allowed?-like
-    matchers.
-  * BSD license.
-  * Version 0.2.0 .
-
-2007-03-30:
-  * Clean up the documentation.
-
-2007-03-28:
-  * Change the tail recursion to a `while' loop, to please Ruby.
-  * Documentation.
-  * Initial release: version 0.1.0 .
data/LICENSE DELETED
@@ -1,21 +0,0 @@
-The MIT License (MIT)
-
-Copyright (c) 2007-2016 Spider Team Authors
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
data/README.md DELETED
@@ -1,165 +0,0 @@
-
-# Spider
-_a Web spidering library for Ruby. It handles the robots.txt,
-scraping, collecting, and looping so that you can just handle the data._
-
-## Examples
-
-### Crawl the Web, loading each page in turn, until you run out of memory
-
-```ruby
-require 'spider'
-Spider.start_at('http://cashcats.biz/') {}
-```
-
-### To handle erroneous responses
-
-```ruby
-require 'spider'
-Spider.start_at('http://cashcats.biz/') do |s|
-  s.on :failure do |a_url, resp, prior_url|
-    puts "URL failed: #{a_url}"
-    puts " linked from #{prior_url}"
-  end
-end
-```
-
-### Or handle successful responses
-
-```ruby
-require 'spider'
-Spider.start_at('http://cashcats.biz/') do |s|
-  s.on :success do |a_url, resp, prior_url|
-    puts "#{a_url}: #{resp.code}"
-    puts resp.body
-    puts
-  end
-end
-```
-
-### Limit to just one domain
-
-```ruby
-require 'spider'
-Spider.start_at('http://cashcats.biz/') do |s|
-  s.add_url_check do |a_url|
-    a_url =~ %r{^http://cashcats.biz.*}
-  end
-end
-```
-
-### Pass headers to some requests
-
-```ruby
-require 'spider'
-Spider.start_at('http://cashcats.biz/') do |s|
-  s.setup do |a_url|
-    if a_url =~ %r{^http://.*wikipedia.*}
-      headers['User-Agent'] = "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
-    end
-  end
-end
-```
-
-### Use memcached to track cycles
-
-```ruby
-require 'spider'
-require 'spider/included_in_memcached'
-SERVERS = ['10.0.10.2:11211','10.0.10.3:11211','10.0.10.4:11211']
-Spider.start_at('http://cashcats.biz/') do |s|
-  s.check_already_seen_with IncludedInMemcached.new(SERVERS)
-end
-```
-
-### Use Redis to track cycles
-
-```ruby
-require 'spider'
-require 'spider/included_in_redis'
-Spider.start_at('http://cashcats.biz/') do |s|
-  s.check_already_seen_with IncludedInRedis.new(host: '127.0.0.1', port: 6379)
-end
-```
-
-### Track cycles with a custom object
-
-```ruby
-require 'spider'
-class ExpireLinks < Hash
-  def <<(v)
-    self[v] = Time.now
-  end
-  def include?(v)
-    self[v].kind_of?(Time) && (self[v] + 86400) >= Time.now
-  end
-end
-
-Spider.start_at('http://cashcats.biz/') do |s|
-  s.check_already_seen_with ExpireLinks.new
-end
-```
-
-### Store nodes to visit with Amazon SQS
-
-```ruby
-require 'spider'
-require 'spider/next_urls_in_sqs'
-Spider.start_at('http://cashcats.biz') do |s|
-  s.store_next_urls_with NextUrlsInSQS.new(AWS_ACCESS_KEY, AWS_SECRET_ACCESS_KEY)
-end
-```
-
-### Store nodes to visit with a custom object
-
-```ruby
-require 'spider'
-class MyArray < Array
-  def pop
-    super
-  end
-
-  def push(a_msg)
-    super(a_msg)
-  end
-end
-
-Spider.start_at('http://cashcats.biz') do |s|
-  s.store_next_urls_with MyArray.new
-end
-```
-
-### Create a URL graph
-
-```ruby
-require 'spider'
-nodes = {}
-Spider.start_at('http://cashcats.biz/') do |s|
-  s.add_url_check {|a_url| a_url =~ %r{^http://cashcats.biz.*} }
-
-  s.on(:every) do |a_url, resp, prior_url|
-    nodes[prior_url] ||= []
-    nodes[prior_url] << a_url
-  end
-end
-```
-
-### Use a proxy
-
-```ruby
-require 'net/http_configuration'
-require 'spider'
-http_conf = Net::HTTP::Configuration.new(:proxy_host => '7proxies.org',
-                                         :proxy_port => 8881)
-http_conf.apply do
-  Spider.start_at('http://img.4chan.org/b/') do |s|
-    s.on(:success) do |a_url, resp, prior_url|
-      File.open(a_url.gsub('/',':'),'w') do |f|
-        f.write(resp.body)
-      end
-    end
-  end
-end
-```
-
-_Copyright (c) 2007-2016 Spider Team Authors_
data/spec/spec_helper.rb DELETED
@@ -1,90 +0,0 @@
-require 'rubygems'
-require 'webrick'
-require 'spec'
-
-Spec::Runner.configure { |c| c.mock_with :mocha }
-
-def local_require(*files)
-  files.each do |file|
-    require File.dirname(__FILE__)+'/../lib/'+file
-  end
-end
-
-class BeStaticServerPages
-  def initialize
-    @pages = ['http://localhost:8888/', 'http://localhost:8888/foo']
-    @actual = nil
-  end
-
-  attr :actual, true
-
-  def matches?(actual)
-    @actual = actual
-    actual == @pages
-  end
-
-  def failure_message
-    "expected #{@pages.inspect}, got #{@actual.inspect}"
-  end
-
-  def description
-    "be the pages returned by the static server (#{@pages.inspect})"
-  end
-end
-
-def with_web_server(svlt)
-  server = WEBrick::HTTPServer.new(:Port => 8888, :Logger => null_logger,
-                                   :AccessLog => [])
-  server.mount('/', svlt)
-  Thread.new {server.start}
-  begin
-    yield
-  ensure
-    server.shutdown
-  end
-end
-
-def with_memcached
-  system('memcached -d -P /tmp/spider-memcached.pid')
-  cacher = IncludedInMemcached.new('localhost:11211')
-  begin
-    yield
-  ensure
-    system('kill -KILL `cat /tmp/spider-memcached.pid`')
-  end
-end
-
-def be_static_server_pages
-  BeStaticServerPages.new
-end
-
-class QueryServlet < WEBrick::HTTPServlet::AbstractServlet
-  def do_GET(req, res)
-    res['Content-type'] = 'text/plain'
-    res.body = "response\n"
-  end
-end
-
-class LoopingServlet < WEBrick::HTTPServlet::AbstractServlet
-  def do_GET(req, res)
-    res['Content-type'] = 'text/html'
-    if req.path == '/foo'
-      res.body = <<-END
-        <a href="/">a</a>
-      END
-    else
-      res.body = <<-END
-        <a href="/foo">b</a>
-      END
-    end
-  end
-end
-
-def null_logger
-  l = stub
-  [:log, :fatal, :error, :warn , :info, :debug].each do |k|
-    l.stubs(k)
-    l.stubs("#{k}?".to_sym)
-  end
-  l
-end
data/spec/spider/included_in_memcached_spec.rb DELETED
@@ -1,43 +0,0 @@
-require File.dirname(__FILE__)+'/../spec_helper'
-
-def before_specing_memcached
-  local_require 'spider/included_in_memcached'
-  system('memcached -d -P /tmp/spider-memcached.pid')
-end
-
-def after_specing_memcached
-  system('kill -KILL `cat /tmp/spider-memcached.pid`')
-end
-
-Spec::Runner.configure { |c| c.mock_with :mocha }
-
-describe 'Object to halt cycles' do
-  before do
-    before_specing_memcached
-  end
-
-  it 'should understand <<' do
-    c = IncludedInMemcached.new('localhost:11211')
-    c.should respond_to(:<<)
-  end
-
-  it 'should understand included?' do
-    c = IncludedInMemcached.new('localhost:11211')
-    c.should respond_to(:include?)
-  end
-
-  it 'should produce false if the object is not included' do
-    c = IncludedInMemcached.new('localhost:11211')
-    c.include?('a').should be_false
-  end
-
-  it 'should produce true if the object is included' do
-    c = IncludedInMemcached.new('localhost:11211')
-    c << 'a'
-    c.include?('a').should be_true
-  end
-
-  after do
-    after_specing_memcached
-  end
-end
data/spec/spider/included_in_redis_spec.rb DELETED
@@ -1,43 +0,0 @@
-require File.dirname(__FILE__)+'/../spec_helper'
-
-def before_specing_redis
-  local_require 'spider/included_in_redis'
-  system('redis-server 127.0.0.1:6379')
-end
-
-def after_specing_redis
-  system('kill -KILL `pidof redis-server`')
-end
-
-Spec::Runner.configure { |c| c.mock_with :mocha }
-
-describe 'Object to halt cycles' do
-  before do
-    before_specing_redis
-  end
-
-  it 'should understand <<' do
-    c = IncludedInRedis.new(host: 'localhost', port: 6379)
-    c.should respond_to(:<<)
-  end
-
-  it 'should understand included?' do
-    c = IncludedInRedis.new(host: 'localhost', port: 6379)
-    c.should respond_to(:include?)
-  end
-
-  it 'should produce false if the object is not included' do
-    c = IncludedInRedis.new(host: 'localhost', port: 6379)
-    c.include?('a').should be_false
-  end
-
-  it 'should produce true if the object is included' do
-    c = IncludedInRedis.new(host: 'localhost', port: 6379)
-    c << 'a'
-    c.include?('a').should be_true
-  end
-
-  after do
-    after_specing_redis
-  end
-end
data/spec/spider/spider_instance_spec.rb DELETED
@@ -1,405 +0,0 @@
-require File.dirname(__FILE__)+'/../spec_helper'
-require 'webrick'
-require 'webrick/https'
-local_require 'spider', 'spider/included_in_memcached'
-
-describe 'SpiderInstance' do
-  # http://www.rcuk.ac.uk/ redirects to /default.htm, which isn't a complete
-  # URL. Bug reported by Henri Cook.
-  it 'should construct a complete redirect URL' do
-    @response_called = false
-    redirected_resp = stub(:redirect? => true,
-                           :[] => '/default.htm')
-    success_resp = stub(:redirect? => false)
-    http_req = stub(:request => true)
-    http_mock_redir = stub(:use_ssl= => true)
-    http_mock_redir.stubs(:start).yields(http_req).returns(redirected_resp)
-    http_mock_success = stub(:use_ssl= => true)
-    http_mock_success.stubs(:start).yields(http_req).returns(success_resp)
-    Net::HTTP.expects(:new).times(2).returns(http_mock_redir).then.
-      returns(http_mock_success)
-    si = SpiderInstance.new({nil => ['http://www.rcuk.ac.uk/']})
-    si.get_page(URI.parse('http://www.rcuk.ac.uk/')) do |resp|
-      @response_called = true
-    end
-    @response_called.should be_true
-  end
-
-  it 'should prevent cycles with an IncludedInMemcached' do
-    with_memcached do
-      cacher = IncludedInMemcached.new('localhost:11211')
-      it_should_prevent_cycles_with(cacher)
-    end
-  end
-
-  it 'should prevent cycles with an Array' do
-    cacher = Array.new
-    it_should_prevent_cycles_with(cacher)
-  end
-
-  it 'should call the "setup" callback before loading the Web page' do
-    mock_successful_http
-    @on_called = false
-    @before_called = false
-    si = SpiderInstance.new({nil => ['http://example.com/']})
-    si.stubs(:allowed?).returns(true)
-    si.stubs(:generate_next_urls).returns([])
-    si.setup { |*a| @before_called = Time.now }
-    si.on(:every) { |*a| @on_called = Time.now }
-    si.start!
-    @on_called.should_not be_false
-    @before_called.should_not be_false
-    @before_called.should_not be_false
-    @before_called.should < @on_called
-  end
-
-  it 'should call the "teardown" callback after running all other callbacks' do
-    mock_successful_http
-    @on_called = false
-    @after_called = false
-    si = SpiderInstance.new({nil => ['http://example.com/']})
-    si.stubs(:allowed?).returns(true)
-    si.stubs(:generate_next_urls).returns([])
-    si.on(:every) { |*a| @on_called = Time.now }
-    si.teardown { |*a| @after_called = Time.now }
-    si.start!
-    @on_called.should_not be_false
-    @after_called.should_not be_false
-    @after_called.should_not be_false
-    @after_called.should > @on_called
-  end
-
-  it 'should pass headers set by a setup handler to the HTTP request' do
-    mock_successful_http
-    Net::HTTP::Get.expects(:new).with('/foo',{'X-Header-Set' => 'True'})
-    si = SpiderInstance.new(nil => ['http://example.com/foo'])
-    si.stubs(:allowable_url?).returns(true)
-    si.stubs(:generate_next_urls).returns([])
-    si.setup do |a_url|
-      si.headers['X-Header-Set'] = 'True'
-    end
-    si.teardown do |a_url|
-      si.clear_headers
-    end
-    si.start!
-  end
-
-  it 'should call the :every callback with the current URL, the response, and the prior URL' do
-    mock_successful_http
-    callback_arguments_on(:every)
-  end
-
-  it 'should call the :success callback with the current URL, the request, and the prior URL' do
-    mock_successful_http
-    callback_arguments_on(:success)
-  end
-
-  it 'should call the :failure callback with the current URL, the request, and the prior URL' do
-    mock_failed_http
-    callback_arguments_on(:failure)
-  end
-
-  it 'should call the HTTP status error code callback with the current URL, the request, and the prior URL' do
-    mock_failed_http
-    callback_arguments_on(404)
-  end
-
-  it 'should call the HTTP status success code callback with the current URL, the request, and the prior URL' do
-    mock_successful_http
-    callback_arguments_on(200)
-  end
-
-  # Bug reported by John Nagro, using the example source http://eons.com/
-  # had to change line 192; uses request_uri now instead of path.
-  it 'should handle query URLs without a path' do
-    u = 'http://localhost:8888?s=1'
-    u_p = URI.parse(u)
-    @block_called = false
-    with_web_server(QueryServlet) do
-      si = SpiderInstance.new({nil => [u]})
-      si.get_page(u_p) do
-        @block_called = true
-      end
-    end
-    @block_called.should be_true
-  end
-
-  # This solves a problem reported by John Nagro.
-  it 'should handle redirects' do
-    u = 'http://example.com/'
-    u_p = URI.parse(u)
-    @redirect_handled = false
-    mock_redirect_http
-    si = SpiderInstance.new({nil => [u]})
-    si.get_page(u_p) do
-      @redirect_handled = true
-    end
-    @redirect_handled.should be_true
-  end
-
-  it 'should handle HTTPS' do
-    u = 'https://localhost:10443/'
-    u_p = URI.parse(u)
-    @page_called = false
-    server = WEBrick::HTTPServer.new(:Port => 10443,
-                                     :Logger => null_logger,
-                                     :AccessLog => [],
-                                     :SSLEnable => true,
-                                     :SSLCertName => [["O", "ruby-lang.org"], ["OU", "sample"], ["CN", WEBrick::Utils::getservername]],
-                                     :SSLComment => 'Comment of some sort')
-    server.mount('/', QueryServlet)
-    Thread.new {server.start}
-    si = SpiderInstance.new({nil => [u]})
-    si.get_page(u_p) { @page_called = true }
-    server.shutdown
-    @page_called.should be_true
-  end
-
-  it 'should skip URLs when allowable_url? is false' do
-    u = 'http://example.com/'
-    u_p = URI.parse(u)
-    http_resp = stub(:redirect? => false, :success? => true, :code => 200, :headers => 1, :body => 1)
-    Net::HTTP.stubs(:new).returns(stub(:request => http_resp, :finish => nil))
-    si = SpiderInstance.new({nil => [u]})
-    si.expects(:allowable_url?).with(u, u_p).returns(false)
-    si.expects(:get_page).times(0)
-    si.start!
-  end
-
-  it 'should not skip URLs when allowable_url? is true' do
-    u = 'http://example.com/'
-    u_p = URI.parse(u)
-    http_resp = stub(:redirect? => false, :success? => true, :code => 200, :headers => 1, :body => 1)
-    Net::HTTP.stubs(:new).returns(stub(:request => http_resp, :finish => nil))
-    si = SpiderInstance.new({nil => [u]})
-    si.expects(:allowable_url?).with(u, u_p).returns(true)
-    si.expects(:get_page).with(URI.parse(u))
-    si.start!
-  end
-
-  it 'should disallow URLs when the robots.txt says to' do
-    robot_rules = stub
-    SpiderInstance.any_instance.expects(:open).
-      with('http://example.com:80/robots.txt', 'User-Agent' => 'Ruby Spider',
-           'Accept' => 'text/html,text/xml,application/xml,text/plain').
-      yields(stub(:read => 'robots.txt content'))
-    robot_rules.expects(:parse).with('http://example.com:80/robots.txt',
-                                     'robots.txt content')
-    robot_rules.expects(:allowed?).with('http://example.com/').returns(false)
-    si = SpiderInstance.new({nil => ['http://example.com/']}, [], robot_rules, [])
-    allowable = si.allowable_url?('http://example.com/',
-                                  URI.parse('http://example.com/'))
-    allowable.should be_false
-  end
-
-  it 'should disallow URLs when they fail any url_check' do
-    si = SpiderInstance.new({nil => ['http://example.com/']})
-    si.stubs(:allowed?).returns(true)
-    si.add_url_check { |a_url| false }
-    allowable = si.allowable_url?('http://example.com/',
-                                  URI.parse('http://example.com/'))
-    allowable.should be_false
-  end
-
-  it 'should support multiple url_checks' do
-    @first_url_check = false
-    @second_url_check = false
-    si = SpiderInstance.new({nil => ['http://example.com/']})
-    si.stubs(:allowed?).returns(true)
-    si.add_url_check do |a_url|
-      @first_url_check = true
-      true
-    end
-    si.add_url_check do |a_url|
-      @second_url_check = true
-      false
-    end
-    allowable = si.allowable_url?('http://example.com/',
-                                  URI.parse('http://example.com/'))
-    allowable.should be_false
-    @first_url_check.should be_true
-    @second_url_check.should be_true
-  end
-
-  it 'should avoid cycles' do
-    u = 'http://example.com/'
-    u_p = URI.parse(u)
-    si = SpiderInstance.new({nil => [u]}, [u_p])
-    si.stubs(:allowed?).returns(true)
-    allowable = si.allowable_url?(u, u_p)
-    allowable.should be_false
-    u_p.should_not be_nil
-  end
-
-  it 'should call the 404 handler for 404s' do
-    @proc_called = false
-    mock_failed_http
-    si = SpiderInstance.new({nil => ['http://example.com/']})
-    si.stubs(:allowed?).returns(true)
-    si.stubs(:generate_next_urls).returns([])
-    si.on(404) {|*a| @proc_called = true}
-    si.start!
-    @proc_called.should be_true
-  end
-
-  it 'should call the :success handler on success' do
-    @proc_called = false
-    mock_successful_http
-    si = SpiderInstance.new({nil => ['http://example.com/']})
-    si.stubs(:allowed?).returns(true)
-    si.stubs(:generate_next_urls).returns([])
-    si.on(:success) {|*a| @proc_called = true}
-    si.start!
-    @proc_called.should be_true
-  end
-
-  it 'should not call the :success handler on failure' do
-    @proc_called = false
-    mock_failed_http
-    si = SpiderInstance.new({nil => ['http://example.com/']})
-    si.stubs(:allowed?).returns(true)
-    si.stubs(:generate_next_urls).returns([])
-    si.on(:success) {|*a| @proc_called = true}
-    si.start!
-    @proc_called.should be_false
-  end
-
-  it 'should call the :success handler and the 200 handler on 200' do
-    @proc_200_called = false
-    @proc_success_called = false
-    mock_successful_http
-    si = SpiderInstance.new({nil => ['http://example.com/']})
-    si.stubs(:allowed?).returns(true)
-    si.stubs(:generate_next_urls).returns([])
-    si.on(:success) {|*a| @proc_success_called = true}
-    si.on(200) {|*a| @proc_200_called = true}
-    si.start!
-    @proc_200_called.should be_true
-    @proc_success_called.should be_true
-  end
-
-  it 'should not call the :failure handler on success' do
-    @proc_called = false
-    mock_successful_http
-    si = SpiderInstance.new({nil => ['http://example.com/']})
-    si.stubs(:allowed?).returns(true)
-    si.stubs(:generate_next_urls).returns([])
-    si.on(:failure) {|*a| @proc_called = true}
-    si.start!
-    @proc_called.should be_false
-  end
-
-  it 'should call the :failure handler on failure' do
-    @proc_called = false
-    mock_failed_http
-    si = SpiderInstance.new({nil => ['http://example.com/']})
-    si.stubs(:allowed?).returns(true)
-    si.stubs(:generate_next_urls).returns([])
-    si.on(:failure) {|*a| @proc_called = true}
-    si.start!
-    @proc_called.should be_true
-  end
-
-  it 'should call the :failure handler and the 404 handler on 404' do
-    @proc_404_called = false
-    @proc_failure_called = false
-    mock_failed_http
-    si = SpiderInstance.new({nil => ['http://example.com/']})
-    si.stubs(:allowed?).returns(true)
-    si.stubs(:generate_next_urls).returns([])
-    si.on(:failure) {|*a| @proc_failure_called = true}
-    si.on(404) {|*a| @proc_404_called = true}
-    si.start!
-    @proc_404_called.should be_true
-    @proc_failure_called.should be_true
-  end
-
-  it 'should call the :every handler even when a handler for the error code is defined' do
-    @any_called = false
-    mock_successful_http
-    si = SpiderInstance.new({nil => ['http://example.com/']})
-    si.stubs(:allowed?).returns(true)
-    si.stubs(:generate_next_urls).returns([])
-    si.on(:every) { |*a| @any_called = true }
-    si.on(202) {|*a|}
-    si.start!
-    @any_called.should be_true
-  end
-
-  it 'should support a block as a response handler' do
-    @proc_called = false
-    mock_successful_http
-    si = SpiderInstance.new({nil => ['http://example.com/']})
-    si.stubs(:allowed?).returns(true)
-    si.stubs(:generate_next_urls).returns([])
-    si.on(:every) { |*a| @proc_called = true }
-    si.start!
-    @proc_called.should be_true
-  end
-
-  it 'should support a proc as a response handler' do
-    @proc_called = false
-    mock_successful_http
-    si = SpiderInstance.new({nil => ['http://example.com/']})
-    si.stubs(:allowed?).returns(true)
-    si.stubs(:generate_next_urls).returns([])
-    si.on(:every, Proc.new { |*a| @proc_called = true })
-    si.start!
-    @proc_called.should be_true
-  end
-
-  def mock_http(http_req)
-    http_obj = mock(:use_ssl= => true)
-    http_obj.expects(:start).
-      yields(mock(:request => http_req)).returns(http_req)
-    Net::HTTP.expects(:new).returns(http_obj)
-  end
-
-  def mock_successful_http
-    http_req = stub(:redirect? => false, :success? => true, :code => 200, :body => 'body')
-    mock_http(http_req)
-  end
-
-  def mock_failed_http
-    http_req = stub(:redirect? => false, :success? => false, :code => 404)
-    mock_http(http_req)
-  end
-
-  def mock_redirect_http
-    http_req = stub(:redirect? => true, :success? => false, :code => 404)
-    http_req.expects(:[]).with('Location').returns('http://example.com/')
-    http_req2 = stub(:redirect? => false, :success? => true, :code => 200)
-    http_obj = mock(:use_ssl= => true)
-    http_obj.expects(:start).
-      yields(mock(:request => http_req)).returns(http_req)
-    http_obj2 = mock(:use_ssl= => true)
-    http_obj2.expects(:start).
-      yields(mock(:request => http_req2)).returns(http_req2)
-    Net::HTTP.expects(:new).times(2).returns(http_obj).then.returns(http_obj2)
-  end
-
-  def callback_arguments_on(code)
-    si = SpiderInstance.new('http://foo.com/' => ['http://example.com/'])
-    si.stubs(:allowed?).returns(true)
-    si.stubs(:generate_next_urls).returns([])
-    si.on(code) do |a_url, resp, prior_url|
-      a_url.should == 'http://example.com/'
-      resp.should_not be_nil
-      prior_url.should == 'http://foo.com/'
-    end
-    si.start!
-  end
-
-  def it_should_prevent_cycles_with(cacher)
-    u = 'http://localhost:8888/'
-    u_p = URI.parse(u)
-    u2 = 'http://localhost:8888/foo'
-    u_p2 = URI.parse(u2)
-
-    with_web_server(LoopingServlet) do
-      si = SpiderInstance.new(nil => [u])
-      si.check_already_seen_with cacher
-      si.start!
-    end
-  end
-end
data/spec/spider_spec.rb DELETED
@@ -1,33 +0,0 @@
-require File.dirname(__FILE__)+'/spec_helper'
-local_require 'spider', 'spider/included_in_memcached'
-
-describe 'Spider' do
-  it 'should find two pages without cycles using defaults' do
-    u = []
-    with_web_server(LoopingServlet) do
-      u = find_pages_with_static_server
-    end
-    u.should be_static_server_pages
-  end
-
-  it 'should find two pages without cycles using memcached' do
-    u = []
-    with_web_server(LoopingServlet) do
-      with_memcached do
-        u = find_pages_with_static_server do |s|
-          s.check_already_seen_with IncludedInMemcached.new('localhost:11211')
-        end
-      end
-    end
-    u.should be_static_server_pages
-  end
-
-  def find_pages_with_static_server(&block)
-    pages = []
-    Spider.start_at('http://localhost:8888/') do |s|
-      block.call(s) unless block.nil?
-      s.on(:every){ |u,r,p| pages << u }
-    end
-    pages
-  end
-end
data/spider.gemspec DELETED
@@ -1,21 +0,0 @@
-require 'rubygems'
-
-require File.expand_path('../lib/spider', __FILE__)
-
-spec = Gem::Specification.new do |s|
-  s.author = 'John Nagro'
-  s.email = 'john.nagro@gmail.com'
-  s.license = 'MIT'
-  s.has_rdoc = true
-  s.homepage = 'https://github.com/johnnagro/spider'
-  s.name = 'spider'
-  s.rubyforge_project = 'spider'
-  s.summary = 'A Web spidering library'
-  s.files = Dir['**/*'].delete_if { |f| f =~ /(cvs|gem|svn)$/i }
-  s.require_path = 'lib'
-  s.description = <<-EOF
-    A Web spidering library: handles robots.txt, scraping, finding more
-    links, and doing it all over again.
-  EOF
-  s.version = Spider::VERSION
-end