rawler 0.0.4 → 0.0.5
Sign up to get free protection for your applications and to get access to all the features.
- data/.autotest +23 -0
- data/Manifest.txt +8 -3
- data/README.txt +10 -1
- data/lib/rawler.rb +1 -1
- data/lib/rawler/base.rb +5 -3
- data/lib/rawler/crawler.rb +6 -3
- data/lib/rawler/request.rb +1 -1
- data/spec/spec_helper.rb +8 -0
- data/spec/unit/base_spec.rb +10 -1
- data/spec/unit/crawler/base_spec.rb +75 -0
- data/spec/unit/crawler/content_type_spec.rb +23 -0
- data/spec/unit/crawler/exceptions_spec.rb +54 -0
- data/spec/unit/crawler/http_basic_spec.rb +25 -0
- data/spec/unit/crawler/url_domain_spec.rb +26 -0
- metadata +15 -8
- data/spec/unit/crawler_spec.rb +0 -114
data/.autotest
ADDED
@@ -0,0 +1,23 @@
|
|
1
|
+
# -*- ruby -*-
|
2
|
+
|
3
|
+
require 'autotest/restart'
|
4
|
+
|
5
|
+
# Autotest.add_hook :initialize do |at|
|
6
|
+
# at.extra_files << "../some/external/dependency.rb"
|
7
|
+
#
|
8
|
+
# at.libs << ":../some/external"
|
9
|
+
#
|
10
|
+
# at.add_exception 'vendor'
|
11
|
+
#
|
12
|
+
# at.add_mapping(/dependency.rb/) do |f, _|
|
13
|
+
# at.files_matching(/test_.*rb$/)
|
14
|
+
# end
|
15
|
+
#
|
16
|
+
# %w(TestA TestB).each do |klass|
|
17
|
+
# at.extra_class_map[klass] = "test/test_misc.rb"
|
18
|
+
# end
|
19
|
+
# end
|
20
|
+
|
21
|
+
# Autotest.add_hook :run_command do |at|
|
22
|
+
# system "rake build"
|
23
|
+
# end
|
data/Manifest.txt
CHANGED
@@ -1,17 +1,22 @@
|
|
1
|
+
.autotest
|
1
2
|
History.txt
|
2
3
|
Manifest.txt
|
3
4
|
README.txt
|
4
5
|
Rakefile
|
5
6
|
bin/rawler
|
7
|
+
lib/rawler.rb
|
6
8
|
lib/rawler/base.rb
|
7
|
-
lib/rawler/core_extensions/module.rb
|
8
9
|
lib/rawler/core_extensions.rb
|
10
|
+
lib/rawler/core_extensions/module.rb
|
9
11
|
lib/rawler/crawler.rb
|
10
12
|
lib/rawler/request.rb
|
11
|
-
lib/rawler.rb
|
12
13
|
spec/spec.opts
|
13
14
|
spec/spec_helper.rb
|
14
15
|
spec/unit/base_spec.rb
|
15
|
-
spec/unit/crawler_spec.rb
|
16
|
+
spec/unit/crawler/base_spec.rb
|
17
|
+
spec/unit/crawler/content_type_spec.rb
|
18
|
+
spec/unit/crawler/exceptions_spec.rb
|
19
|
+
spec/unit/crawler/http_basic_spec.rb
|
20
|
+
spec/unit/crawler/url_domain_spec.rb
|
16
21
|
tasks/rspec.rake
|
17
22
|
vendor/lib-trollop.rb
|
data/README.txt
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
= rawler
|
2
2
|
|
3
|
-
* http://github.com
|
3
|
+
* http://github.com/oscardelben/rawler
|
4
4
|
|
5
5
|
== DESCRIPTION:
|
6
6
|
|
@@ -8,6 +8,8 @@ Rawler is a Ruby library that crawls your website and checks the status code for
|
|
8
8
|
|
9
9
|
Rawler will only parse pages with content type 'text/html', but it will check for the response code of every link.
|
10
10
|
|
11
|
+
Please note: I had to temporarily remove URL encoding in order to resolve some issues, so if you run into any problems, please let me know. I'm also going to use Mechanize for parsing pages in the next release.
|
12
|
+
|
11
13
|
== SYNOPSIS:
|
12
14
|
|
13
15
|
rawler http://example.com [options]
|
@@ -24,8 +26,15 @@ gem install rawler
|
|
24
26
|
|
25
27
|
== TODO
|
26
28
|
|
29
|
+
* Follow redirects, but still inform about them
|
30
|
+
* Respect robots.txt
|
27
31
|
* Export to html
|
28
32
|
|
33
|
+
== CONTRIBUTORS:
|
34
|
+
|
35
|
+
* Vesa Vänskä https://github.com/vesan
|
36
|
+
* Hugh Sasse
|
37
|
+
|
29
38
|
== LICENSE:
|
30
39
|
|
31
40
|
(The MIT License)
|
data/lib/rawler.rb
CHANGED
data/lib/rawler/base.rb
CHANGED
@@ -22,6 +22,8 @@ module Rawler
|
|
22
22
|
def validate_links_in_page(current_url)
|
23
23
|
Rawler::Crawler.new(current_url).links.each do |page_url|
|
24
24
|
validate_page(page_url)
|
25
|
+
# Todo: include this in a configuration option
|
26
|
+
sleep(3)
|
25
27
|
end
|
26
28
|
end
|
27
29
|
|
@@ -39,13 +41,13 @@ module Rawler
|
|
39
41
|
responses[link] = { :status => response.code.to_i }
|
40
42
|
rescue Errno::ECONNREFUSED
|
41
43
|
write("Connection refused - '#{link}'")
|
42
|
-
rescue Timeout::Error, Errno::EINVAL, Errno::ECONNRESET,
|
43
|
-
|
44
|
+
rescue Timeout::Error, Errno::EINVAL, Errno::ECONNRESET, Errno::ETIMEDOUT,
|
45
|
+
EOFError, Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError, Net::ProtocolError
|
44
46
|
write("Connection problems - '#{link}'")
|
45
47
|
end
|
46
48
|
|
47
49
|
def same_domain?(link)
|
48
|
-
URI.parse(
|
50
|
+
URI.parse(Rawler.url).host == URI.parse(link).host
|
49
51
|
end
|
50
52
|
|
51
53
|
def not_yet_parsed?(link)
|
data/lib/rawler/crawler.rb
CHANGED
@@ -20,12 +20,15 @@ module Rawler
|
|
20
20
|
rescue Errno::ECONNREFUSED
|
21
21
|
write("Couldn't connect to #{url}")
|
22
22
|
[]
|
23
|
+
rescue Errno::ETIMEDOUT
|
24
|
+
write("Connection to #{url} timed out")
|
25
|
+
[]
|
23
26
|
end
|
24
27
|
|
25
28
|
private
|
26
29
|
|
27
30
|
def absolute_url(path)
|
28
|
-
URI.parse(
|
31
|
+
URI.parse(url).merge(path.to_s).to_s
|
29
32
|
end
|
30
33
|
|
31
34
|
def write(message)
|
@@ -33,7 +36,7 @@ module Rawler
|
|
33
36
|
end
|
34
37
|
|
35
38
|
def different_domain?(url_1, url_2)
|
36
|
-
URI.parse(
|
39
|
+
URI.parse(url_1).host != URI.parse(url_2).host
|
37
40
|
end
|
38
41
|
|
39
42
|
def not_html?(url)
|
@@ -41,7 +44,7 @@ module Rawler
|
|
41
44
|
end
|
42
45
|
|
43
46
|
def valid_url?(url)
|
44
|
-
scheme = URI.parse(
|
47
|
+
scheme = URI.parse(url).scheme
|
45
48
|
|
46
49
|
['http', 'https'].include?(scheme)
|
47
50
|
end
|
data/lib/rawler/request.rb
CHANGED
data/spec/spec_helper.rb
CHANGED
data/spec/unit/base_spec.rb
CHANGED
@@ -49,6 +49,15 @@ describe Rawler::Base do
|
|
49
49
|
|
50
50
|
rawler.validate
|
51
51
|
end
|
52
|
+
|
53
|
+
it "should validate links with #hashtags" do
|
54
|
+
register('http://example.com/foo1', '<a href="http://example.com/page-with#hashtag">x</a>')
|
55
|
+
register('http://example.com/page-with', '')
|
56
|
+
|
57
|
+
output.should_receive(:puts).with('200 - http://example.com/page-with#hashtag')
|
58
|
+
|
59
|
+
rawler.validate
|
60
|
+
end
|
52
61
|
|
53
62
|
end
|
54
63
|
|
@@ -89,7 +98,7 @@ describe Rawler::Base do
|
|
89
98
|
rawler.send(:add_status_code, url)
|
90
99
|
end
|
91
100
|
|
92
|
-
[Timeout::Error, Errno::EINVAL, Errno::ECONNRESET, EOFError,
|
101
|
+
[Timeout::Error, Errno::EINVAL, Errno::ECONNRESET, Errno::ETIMEDOUT, EOFError,
|
93
102
|
Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError, Net::ProtocolError].each do |error|
|
94
103
|
it "should rescue from #{error}" do
|
95
104
|
url = 'http://example.com'
|
@@ -0,0 +1,75 @@
|
|
1
|
+
require File.dirname(__FILE__) + '/../../spec_helper.rb'
|
2
|
+
|
3
|
+
describe Rawler::Crawler do
|
4
|
+
|
5
|
+
context "basic functionality" do
|
6
|
+
|
7
|
+
let(:url) { 'http://example.com' }
|
8
|
+
let(:crawler) { Rawler::Crawler.new(url) }
|
9
|
+
let(:content) {
|
10
|
+
content = <<-content
|
11
|
+
<p><a href="http://example.com/foo">foo</a></p>
|
12
|
+
|
13
|
+
<p><a href="http://external.com/bar">bar</a></p>
|
14
|
+
content
|
15
|
+
}
|
16
|
+
|
17
|
+
before(:each) do
|
18
|
+
register(url, content)
|
19
|
+
end
|
20
|
+
|
21
|
+
it "should parse all links" do
|
22
|
+
crawler.links.should == ['http://example.com/foo', 'http://external.com/bar']
|
23
|
+
end
|
24
|
+
|
25
|
+
end
|
26
|
+
|
27
|
+
context "relative paths" do
|
28
|
+
|
29
|
+
let(:url) { 'http://example.com/path' }
|
30
|
+
let(:crawler) { Rawler::Crawler.new(url) }
|
31
|
+
let(:content) { '<a href="/foo">foo</a>' }
|
32
|
+
|
33
|
+
before(:each) do
|
34
|
+
register(url, content)
|
35
|
+
end
|
36
|
+
|
37
|
+
it "should parse relative links" do
|
38
|
+
crawler.links.should == ['http://example.com/foo']
|
39
|
+
end
|
40
|
+
|
41
|
+
end
|
42
|
+
|
43
|
+
context "different domains" do
|
44
|
+
|
45
|
+
let(:url) { 'http://external.com/path' }
|
46
|
+
let(:crawler) { Rawler::Crawler.new(url) }
|
47
|
+
let(:content) { '<a href="/foo">foo</a>' }
|
48
|
+
|
49
|
+
before(:each) do
|
50
|
+
register(url, content)
|
51
|
+
end
|
52
|
+
|
53
|
+
it "should parse relative links" do
|
54
|
+
crawler.links.should == []
|
55
|
+
end
|
56
|
+
|
57
|
+
end
|
58
|
+
|
59
|
+
context "urls with hash tags" do
|
60
|
+
|
61
|
+
let(:url) { 'http://example.com/path' }
|
62
|
+
let(:crawler) { Rawler::Crawler.new(url) }
|
63
|
+
let(:content) { '<a href="/foo#bar">foo</a>' }
|
64
|
+
|
65
|
+
before(:each) do
|
66
|
+
register(url, content)
|
67
|
+
end
|
68
|
+
|
69
|
+
it "should parse relative links" do
|
70
|
+
crawler.links.should == ['http://example.com/foo#bar']
|
71
|
+
end
|
72
|
+
|
73
|
+
end
|
74
|
+
|
75
|
+
end
|
@@ -0,0 +1,23 @@
|
|
1
|
+
require File.dirname(__FILE__) + '/../../spec_helper.rb'
|
2
|
+
|
3
|
+
describe Rawler::Crawler do
|
4
|
+
|
5
|
+
context "content type" do
|
6
|
+
|
7
|
+
['text/plain', 'text/css', 'image/jpeg'].each do |content_type|
|
8
|
+
|
9
|
+
let(:url) { 'http://example.com' }
|
10
|
+
let(:crawler) { Rawler::Crawler.new(url) }
|
11
|
+
|
12
|
+
before(:each) do
|
13
|
+
register(url, '', 200, :content_type => content_type)
|
14
|
+
end
|
15
|
+
|
16
|
+
it "should ignore '#{content_type}'" do
|
17
|
+
crawler.links.should == []
|
18
|
+
end
|
19
|
+
|
20
|
+
end
|
21
|
+
end
|
22
|
+
|
23
|
+
end
|
@@ -0,0 +1,54 @@
|
|
1
|
+
require File.dirname(__FILE__) + '/../../spec_helper.rb'
|
2
|
+
|
3
|
+
describe Rawler::Crawler do
|
4
|
+
|
5
|
+
context "Exceptions" do
|
6
|
+
|
7
|
+
let(:url) { 'http://example.com' }
|
8
|
+
let(:crawler) { Rawler::Crawler.new(url) }
|
9
|
+
let(:output) { double('output', :puts => nil) }
|
10
|
+
|
11
|
+
before(:each) do
|
12
|
+
register(url, '')
|
13
|
+
Rawler.stub!(:output).and_return(output)
|
14
|
+
end
|
15
|
+
|
16
|
+
context "Errno::ECONNREFUSED" do
|
17
|
+
|
18
|
+
before(:each) do
|
19
|
+
Rawler::Request.stub!(:get).and_raise Errno::ECONNREFUSED
|
20
|
+
end
|
21
|
+
|
22
|
+
it "should return an empty array" do
|
23
|
+
crawler.links.should == []
|
24
|
+
end
|
25
|
+
|
26
|
+
it "should print a message when raising Errno::ECONNREFUSED" do
|
27
|
+
output.should_receive(:puts).with("Couldn't connect to #{url}")
|
28
|
+
|
29
|
+
crawler.links
|
30
|
+
end
|
31
|
+
|
32
|
+
end
|
33
|
+
|
34
|
+
context "Errno::ETIMEDOUT" do
|
35
|
+
|
36
|
+
before(:each) do
|
37
|
+
Rawler::Request.stub!(:get).and_raise Errno::ETIMEDOUT
|
38
|
+
end
|
39
|
+
|
40
|
+
it "should return an empty array when raising Errno::ETIMEDOUT" do
|
41
|
+
crawler.links.should == []
|
42
|
+
end
|
43
|
+
|
44
|
+
it "should print a message when raising Errno::ETIMEDOUT" do
|
45
|
+
output.should_receive(:puts).with("Connection to #{url} timed out")
|
46
|
+
|
47
|
+
crawler.links
|
48
|
+
end
|
49
|
+
|
50
|
+
end
|
51
|
+
|
52
|
+
end
|
53
|
+
|
54
|
+
end
|
@@ -0,0 +1,25 @@
|
|
1
|
+
require File.dirname(__FILE__) + '/../../spec_helper.rb'
|
2
|
+
|
3
|
+
describe Rawler::Crawler do
|
4
|
+
|
5
|
+
context "http basic" do
|
6
|
+
|
7
|
+
let(:url) { 'http://example.com' }
|
8
|
+
let(:content) { '<a href="http://example.com/secret-path">foo</a>' }
|
9
|
+
let(:crawler) { Rawler::Crawler.new('http://example.com/secret') }
|
10
|
+
|
11
|
+
before(:each) do
|
12
|
+
register('http://example.com/secret', '', :status => ["401", "Unauthorized"])
|
13
|
+
register('http://foo:bar@example.com/secret', content)
|
14
|
+
|
15
|
+
Rawler.stub!(:username).and_return('foo')
|
16
|
+
Rawler.stub!(:password).and_return('bar')
|
17
|
+
end
|
18
|
+
|
19
|
+
it "should crawl http basic pages" do
|
20
|
+
crawler.links.should == ['http://example.com/secret-path']
|
21
|
+
end
|
22
|
+
|
23
|
+
end
|
24
|
+
|
25
|
+
end
|
@@ -0,0 +1,26 @@
|
|
1
|
+
require File.dirname(__FILE__) + '/../../spec_helper.rb'
|
2
|
+
|
3
|
+
describe Rawler::Crawler do
|
4
|
+
|
5
|
+
context "url domain" do
|
6
|
+
|
7
|
+
let(:content) {
|
8
|
+
content = <<-content
|
9
|
+
<a href="http://example.com/valid">foo</a>
|
10
|
+
<a href="mailto:info@example.com">invalid</a>
|
11
|
+
<a href="https://foo.com">valid</a>
|
12
|
+
content
|
13
|
+
}
|
14
|
+
let(:url) { 'http://example.com' }
|
15
|
+
let(:crawler) { Rawler::Crawler.new(url) }
|
16
|
+
|
17
|
+
before(:each) do
|
18
|
+
register(url, content)
|
19
|
+
end
|
20
|
+
|
21
|
+
it "should ignore links other than http or https" do
|
22
|
+
crawler.links.should == ['http://example.com/valid', 'https://foo.com']
|
23
|
+
end
|
24
|
+
end
|
25
|
+
|
26
|
+
end
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rawler
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 21
|
5
5
|
prerelease:
|
6
6
|
segments:
|
7
7
|
- 0
|
8
8
|
- 0
|
9
|
-
- 4
|
10
|
-
version: 0.0.4
|
9
|
+
- 5
|
10
|
+
version: 0.0.5
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Oscar Del Ben
|
@@ -15,7 +15,7 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date: 2011-01-
|
18
|
+
date: 2011-01-21 00:00:00 +01:00
|
19
19
|
default_executable:
|
20
20
|
dependencies:
|
21
21
|
- !ruby/object:Gem::Dependency
|
@@ -52,6 +52,8 @@ description: |-
|
|
52
52
|
Rawler is a Ruby library that crawls your website and checks the status code for each of your links. Useful for finding dead links.
|
53
53
|
|
54
54
|
Rawler will only parse pages with content type 'text/html', but it will check for the response code of every link.
|
55
|
+
|
56
|
+
Please note: I had to temporarily remove URL encoding in order to resolve some issues, so if you run into any problems, please let me know. I'm also going to use Mechanize for parsing pages in the next release.
|
55
57
|
email:
|
56
58
|
- info@oscardelben.com
|
57
59
|
executables:
|
@@ -63,25 +65,30 @@ extra_rdoc_files:
|
|
63
65
|
- Manifest.txt
|
64
66
|
- README.txt
|
65
67
|
files:
|
68
|
+
- .autotest
|
66
69
|
- History.txt
|
67
70
|
- Manifest.txt
|
68
71
|
- README.txt
|
69
72
|
- Rakefile
|
70
73
|
- bin/rawler
|
74
|
+
- lib/rawler.rb
|
71
75
|
- lib/rawler/base.rb
|
72
|
-
- lib/rawler/core_extensions/module.rb
|
73
76
|
- lib/rawler/core_extensions.rb
|
77
|
+
- lib/rawler/core_extensions/module.rb
|
74
78
|
- lib/rawler/crawler.rb
|
75
79
|
- lib/rawler/request.rb
|
76
|
-
- lib/rawler.rb
|
77
80
|
- spec/spec.opts
|
78
81
|
- spec/spec_helper.rb
|
79
82
|
- spec/unit/base_spec.rb
|
80
|
-
- spec/unit/crawler_spec.rb
|
83
|
+
- spec/unit/crawler/base_spec.rb
|
84
|
+
- spec/unit/crawler/content_type_spec.rb
|
85
|
+
- spec/unit/crawler/exceptions_spec.rb
|
86
|
+
- spec/unit/crawler/http_basic_spec.rb
|
87
|
+
- spec/unit/crawler/url_domain_spec.rb
|
81
88
|
- tasks/rspec.rake
|
82
89
|
- vendor/lib-trollop.rb
|
83
90
|
has_rdoc: true
|
84
|
-
homepage: http://github.com
|
91
|
+
homepage: http://github.com/oscardelben/rawler
|
85
92
|
licenses: []
|
86
93
|
|
87
94
|
post_install_message:
|
data/spec/unit/crawler_spec.rb
DELETED
@@ -1,114 +0,0 @@
|
|
1
|
-
require File.dirname(__FILE__) + '/../spec_helper.rb'
|
2
|
-
|
3
|
-
describe Rawler::Crawler do
|
4
|
-
|
5
|
-
let(:url) { 'http://example.com' }
|
6
|
-
|
7
|
-
before(:each) do
|
8
|
-
Rawler.stub!(:url).and_return(url)
|
9
|
-
end
|
10
|
-
|
11
|
-
it "should parse all links" do
|
12
|
-
register(url, site)
|
13
|
-
|
14
|
-
Rawler::Crawler.new(url).links.should == ['http://example.com/foo', 'http://external.com/bar']
|
15
|
-
end
|
16
|
-
|
17
|
-
it "should parse relative links" do
|
18
|
-
url = 'http://example.com/path'
|
19
|
-
register(url, '<a href="/foo">foo</a>')
|
20
|
-
|
21
|
-
Rawler::Crawler.new(url).links.should == ['http://example.com/foo']
|
22
|
-
end
|
23
|
-
|
24
|
-
it "should parse links only if the page is in the same domain as the main url" do
|
25
|
-
url = 'http://external.com/path'
|
26
|
-
register(url, '<a href="/foo">foo</a>')
|
27
|
-
|
28
|
-
Rawler.should_receive(:url).and_return('http://example.com')
|
29
|
-
|
30
|
-
Rawler::Crawler.new(url).links.should == []
|
31
|
-
end
|
32
|
-
|
33
|
-
it "should return an empty array when raising Errno::ECONNREFUSED" do
|
34
|
-
register(url, site)
|
35
|
-
crawler = Rawler::Crawler.new(url)
|
36
|
-
|
37
|
-
Rawler::Request.should_receive(:get).and_raise Errno::ECONNREFUSED
|
38
|
-
|
39
|
-
crawler.links.should == []
|
40
|
-
end
|
41
|
-
|
42
|
-
it "should print a message when raising Errno::ECONNREFUSED" do
|
43
|
-
output = double('output')
|
44
|
-
register(url, site)
|
45
|
-
|
46
|
-
crawler = Rawler::Crawler.new(url)
|
47
|
-
|
48
|
-
Rawler::Request.should_receive(:get).and_raise Errno::ECONNREFUSED
|
49
|
-
Rawler.should_receive(:output).and_return(output)
|
50
|
-
output.should_receive(:puts).with("Couldn't connect to #{url}")
|
51
|
-
|
52
|
-
crawler.links
|
53
|
-
end
|
54
|
-
|
55
|
-
context "should ignore content type other than text/html" do
|
56
|
-
|
57
|
-
['text/plain', 'text/css', 'image/jpeg'].each do |content_type|
|
58
|
-
|
59
|
-
it "should ignore '#{content_type}'" do
|
60
|
-
register(url, site, 200, :content_type => content_type)
|
61
|
-
|
62
|
-
crawler = Rawler::Crawler.new(url)
|
63
|
-
crawler.links.should == []
|
64
|
-
end
|
65
|
-
|
66
|
-
end
|
67
|
-
|
68
|
-
end
|
69
|
-
|
70
|
-
it "should ignore links other than http or https" do
|
71
|
-
content = <<-content
|
72
|
-
<a href="http://example.com/valid">foo</a>
|
73
|
-
<a href="mailto:info@example.com">invalid</a>
|
74
|
-
<a href="https://foo.com">valid</a>
|
75
|
-
content
|
76
|
-
|
77
|
-
register(url, content)
|
78
|
-
|
79
|
-
crawler = Rawler::Crawler.new(url)
|
80
|
-
crawler.links.should == ['http://example.com/valid', 'https://foo.com']
|
81
|
-
end
|
82
|
-
|
83
|
-
it "should crawl http basic pages" do
|
84
|
-
content = '<a href="http://example.com/secret-path">foo</a>'
|
85
|
-
|
86
|
-
register('http://example.com/secret', '', :status => ["401", "Unauthorized"])
|
87
|
-
register('http://foo:bar@example.com/secret', content)
|
88
|
-
|
89
|
-
Rawler.stub!(:username).and_return('foo')
|
90
|
-
Rawler.stub!(:password).and_return('bar')
|
91
|
-
|
92
|
-
crawler = Rawler::Crawler.new('http://example.com/secret')
|
93
|
-
crawler.links.should == ['http://example.com/secret-path']
|
94
|
-
end
|
95
|
-
|
96
|
-
private
|
97
|
-
|
98
|
-
def site
|
99
|
-
<<-site
|
100
|
-
<!DOCTYPE html>
|
101
|
-
<html>
|
102
|
-
<body>
|
103
|
-
<p>Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.</p>
|
104
|
-
|
105
|
-
<p><a href="http://example.com/foo">foo</a></p>
|
106
|
-
|
107
|
-
<p><a href="http://external.com/bar">bar</a></p>
|
108
|
-
|
109
|
-
</body>
|
110
|
-
</html>
|
111
|
-
site
|
112
|
-
end
|
113
|
-
|
114
|
-
end
|