rawler 0.0.5 → 0.0.6
Sign up to get free protection for your applications and to get access to all the features.
- data/Gemfile +9 -0
- data/Gemfile.lock +22 -0
- data/Manifest.txt +7 -7
- data/README.txt +26 -4
- data/Rakefile +11 -0
- data/bin/rawler +2 -1
- data/lib/rawler.rb +13 -2
- data/lib/rawler/base.rb +27 -6
- data/lib/rawler/crawler.rb +24 -7
- data/spec/lib/base_spec.rb +5 -0
- data/spec/lib/rawler/base_spec.rb +0 -0
- data/spec/lib/rawler/crawler_spec.rb +214 -0
- data/spec/{unit/base_spec.rb → lib/rawler_spec.rb} +79 -8
- data/spec/spec_helper.rb +1 -1
- data/specs.watchr +59 -0
- metadata +11 -13
- data/.autotest +0 -23
- data/spec/unit/crawler/base_spec.rb +0 -75
- data/spec/unit/crawler/content_type_spec.rb +0 -23
- data/spec/unit/crawler/exceptions_spec.rb +0 -54
- data/spec/unit/crawler/http_basic_spec.rb +0 -25
- data/spec/unit/crawler/url_domain_spec.rb +0 -26
data/Gemfile
ADDED
data/Gemfile.lock
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
GEM
|
2
|
+
remote: http://rubygems.org/
|
3
|
+
specs:
|
4
|
+
diff-lcs (1.1.2)
|
5
|
+
fakeweb (1.3.0)
|
6
|
+
nokogiri (1.4.4)
|
7
|
+
rspec (2.4.0)
|
8
|
+
rspec-core (~> 2.4.0)
|
9
|
+
rspec-expectations (~> 2.4.0)
|
10
|
+
rspec-mocks (~> 2.4.0)
|
11
|
+
rspec-core (2.4.0)
|
12
|
+
rspec-expectations (2.4.0)
|
13
|
+
diff-lcs (~> 1.1.2)
|
14
|
+
rspec-mocks (2.4.0)
|
15
|
+
|
16
|
+
PLATFORMS
|
17
|
+
ruby
|
18
|
+
|
19
|
+
DEPENDENCIES
|
20
|
+
fakeweb (= 1.3.0)
|
21
|
+
nokogiri (= 1.4.4)
|
22
|
+
rspec (= 2.4.0)
|
data/Manifest.txt
CHANGED
@@ -1,4 +1,5 @@
|
|
1
|
-
|
1
|
+
Gemfile
|
2
|
+
Gemfile.lock
|
2
3
|
History.txt
|
3
4
|
Manifest.txt
|
4
5
|
README.txt
|
@@ -10,13 +11,12 @@ lib/rawler/core_extensions.rb
|
|
10
11
|
lib/rawler/core_extensions/module.rb
|
11
12
|
lib/rawler/crawler.rb
|
12
13
|
lib/rawler/request.rb
|
14
|
+
spec/lib/base_spec.rb
|
15
|
+
spec/lib/rawler/base_spec.rb
|
16
|
+
spec/lib/rawler/crawler_spec.rb
|
17
|
+
spec/lib/rawler_spec.rb
|
13
18
|
spec/spec.opts
|
14
19
|
spec/spec_helper.rb
|
15
|
-
|
16
|
-
spec/unit/crawler/base_spec.rb
|
17
|
-
spec/unit/crawler/content_type_spec.rb
|
18
|
-
spec/unit/crawler/exceptions_spec.rb
|
19
|
-
spec/unit/crawler/http_basic_spec.rb
|
20
|
-
spec/unit/crawler/url_domain_spec.rb
|
20
|
+
specs.watchr
|
21
21
|
tasks/rspec.rake
|
22
22
|
vendor/lib-trollop.rb
|
data/README.txt
CHANGED
@@ -8,8 +8,6 @@ Rawler is a Ruby library that crawls your website and checks the status code for
|
|
8
8
|
|
9
9
|
Rawler will only parse pages with content type 'text/html', but it will check for the response code of every link.
|
10
10
|
|
11
|
-
Please note: I had to temporarily remove url encoding in order to resolve some issues, so if you find any issue, please let me know. I'm also going to use Mechanizer for parsing pages with the next release.
|
12
|
-
|
13
11
|
== SYNOPSIS:
|
14
12
|
|
15
13
|
rawler http://example.com [options]
|
@@ -24,16 +22,40 @@ Please note: I had to temporarily remove url encoding in order to resolve some i
|
|
24
22
|
|
25
23
|
gem install rawler
|
26
24
|
|
25
|
+
== DEVELOPMENT:
|
26
|
+
|
27
|
+
Run bundle install to install everything you need
|
28
|
+
|
29
|
+
rake test
|
30
|
+
|
31
|
+
To package and run the gem locally:
|
32
|
+
|
33
|
+
rake package
|
34
|
+
cd pkg
|
35
|
+
gem install rawler-#{version}.gem
|
36
|
+
|
37
|
+
If you add files, run:
|
38
|
+
|
39
|
+
rake check_manifest
|
40
|
+
|
41
|
+
And add them to the Manifest file.
|
42
|
+
|
27
43
|
== TODO
|
28
44
|
|
45
|
+
* Add logger levels
|
29
46
|
* Follow redirects, but still inform about them
|
30
47
|
* Respect robots.txt
|
31
48
|
* Export to html
|
32
49
|
|
33
50
|
== CONTRIBUTORS:
|
34
51
|
|
35
|
-
*
|
52
|
+
* bcoob
|
36
53
|
* Hugh Sasse
|
54
|
+
* Ken Egozi
|
55
|
+
* Robert Glaser
|
56
|
+
* Vesa Vänskä
|
57
|
+
|
58
|
+
See also https://github.com/oscardelben/rawler/contributors
|
37
59
|
|
38
60
|
== LICENSE:
|
39
61
|
|
@@ -58,4 +80,4 @@ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
|
58
80
|
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
59
81
|
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
60
82
|
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
61
|
-
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
83
|
+
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/Rakefile
CHANGED
@@ -3,6 +3,12 @@
|
|
3
3
|
require 'rubygems'
|
4
4
|
require 'hoe'
|
5
5
|
|
6
|
+
# require 'bundler'
|
7
|
+
# Bundler::GemHelper.install_tasks
|
8
|
+
|
9
|
+
require 'rspec/core/rake_task'
|
10
|
+
RSpec::Core::RakeTask.new(:test)
|
11
|
+
|
6
12
|
# Hoe.plugin :compiler
|
7
13
|
# Hoe.plugin :gem_prelude_sucks
|
8
14
|
# Hoe.plugin :inline
|
@@ -21,4 +27,9 @@ Hoe.spec 'rawler' do
|
|
21
27
|
extra_deps << ['nokogiri']
|
22
28
|
end
|
23
29
|
|
30
|
+
desc 'Console'
|
31
|
+
task :console do
|
32
|
+
exec 'irb -rubygems -I lib -r rawler.rb'
|
33
|
+
end
|
34
|
+
|
24
35
|
# vim: syntax=ruby
|
data/bin/rawler
CHANGED
data/lib/rawler.rb
CHANGED
@@ -2,11 +2,12 @@ require 'rubygems'
|
|
2
2
|
require 'net/http'
|
3
3
|
require 'net/https'
|
4
4
|
require 'nokogiri'
|
5
|
+
require 'logger'
|
5
6
|
|
6
7
|
require 'rawler/core_extensions'
|
7
8
|
|
8
9
|
module Rawler
|
9
|
-
VERSION = '0.0.5'
|
10
|
+
VERSION = '0.0.6'
|
10
11
|
|
11
12
|
mattr_accessor :output
|
12
13
|
mattr_accessor :url
|
@@ -16,4 +17,14 @@ module Rawler
|
|
16
17
|
autoload :Base, "rawler/base"
|
17
18
|
autoload :Crawler, "rawler/crawler"
|
18
19
|
autoload :Request, "rawler/request"
|
19
|
-
|
20
|
+
|
21
|
+
def self.url=(url)
|
22
|
+
url.strip!
|
23
|
+
|
24
|
+
if (url =~ /http:\/\//) != 0
|
25
|
+
url = 'http://' + url
|
26
|
+
end
|
27
|
+
|
28
|
+
@@url = url
|
29
|
+
end
|
30
|
+
end
|
data/lib/rawler/base.rb
CHANGED
@@ -8,7 +8,7 @@ module Rawler
|
|
8
8
|
@responses = {}
|
9
9
|
|
10
10
|
Rawler.url = url
|
11
|
-
Rawler.output = output
|
11
|
+
Rawler.output = Logger.new(output)
|
12
12
|
Rawler.username = username
|
13
13
|
Rawler.password = password
|
14
14
|
end
|
@@ -37,13 +37,13 @@ module Rawler
|
|
37
37
|
def add_status_code(link)
|
38
38
|
response = Rawler::Request.get(link)
|
39
39
|
|
40
|
-
|
40
|
+
record_response(response.code, link)
|
41
41
|
responses[link] = { :status => response.code.to_i }
|
42
42
|
rescue Errno::ECONNREFUSED
|
43
|
-
|
43
|
+
Rawler.output.error("Connection refused - '#{link}'")
|
44
44
|
rescue Timeout::Error, Errno::EINVAL, Errno::ECONNRESET, Errno::ETIMEDOUT,
|
45
45
|
EOFError, Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError, Net::ProtocolError
|
46
|
-
|
46
|
+
Rawler.output.error("Connection problems - '#{link}'")
|
47
47
|
end
|
48
48
|
|
49
49
|
def same_domain?(link)
|
@@ -55,9 +55,30 @@ module Rawler
|
|
55
55
|
end
|
56
56
|
|
57
57
|
def write(message)
|
58
|
-
|
58
|
+
# TODO: This may not always be an error message,
|
59
|
+
# but that will make it show up most of the time
|
60
|
+
Rawler.output.error(message)
|
61
|
+
end
|
62
|
+
|
63
|
+
def record_response(code, link)
|
64
|
+
message = "#{code} - #{link}"
|
65
|
+
code = code.to_i
|
66
|
+
case code / 100
|
67
|
+
when 1
|
68
|
+
# TODO: check that if a 100 is received
|
69
|
+
# then there is another status code as well
|
70
|
+
Rawler.output.info(message)
|
71
|
+
when 2 then
|
72
|
+
Rawler.output.info(message)
|
73
|
+
when 3 then
|
74
|
+
Rawler.output.warn(message)
|
75
|
+
when 4,5 then
|
76
|
+
Rawler.output.error(message)
|
77
|
+
else
|
78
|
+
Rawler.output.error("Unknown code #{message}")
|
79
|
+
end
|
59
80
|
end
|
60
81
|
|
61
82
|
end
|
62
83
|
|
63
|
-
end
|
84
|
+
end
|
data/lib/rawler/crawler.rb
CHANGED
@@ -5,7 +5,7 @@ module Rawler
|
|
5
5
|
attr_accessor :url, :links
|
6
6
|
|
7
7
|
def initialize(url)
|
8
|
-
@url = url
|
8
|
+
@url = url.strip
|
9
9
|
end
|
10
10
|
|
11
11
|
def links
|
@@ -16,7 +16,7 @@ module Rawler
|
|
16
16
|
response = Rawler::Request.get(url)
|
17
17
|
|
18
18
|
doc = Nokogiri::HTML(response.body)
|
19
|
-
doc.css('a').map { |a|
|
19
|
+
doc.css('a').map { |a| a['href'] }.map { |url| absolute_url(url) }.select { |url| valid_url?(url) }
|
20
20
|
rescue Errno::ECONNREFUSED
|
21
21
|
write("Couldn't connect to #{url}")
|
22
22
|
[]
|
@@ -28,11 +28,16 @@ module Rawler
|
|
28
28
|
private
|
29
29
|
|
30
30
|
def absolute_url(path)
|
31
|
-
|
31
|
+
path.strip!
|
32
|
+
if path[0].chr == '/'
|
33
|
+
URI.parse(url).merge(path.to_s).to_s
|
34
|
+
else
|
35
|
+
path
|
36
|
+
end
|
32
37
|
end
|
33
38
|
|
34
39
|
def write(message)
|
35
|
-
Rawler.output.puts(message)
|
40
|
+
Rawler.output.error(message)
|
36
41
|
end
|
37
42
|
|
38
43
|
def different_domain?(url_1, url_2)
|
@@ -44,11 +49,23 @@ module Rawler
|
|
44
49
|
end
|
45
50
|
|
46
51
|
def valid_url?(url)
|
52
|
+
return false unless url
|
53
|
+
|
54
|
+
url.strip!
|
47
55
|
scheme = URI.parse(url).scheme
|
56
|
+
|
57
|
+
if ['http', 'https'].include?(scheme)
|
58
|
+
true
|
59
|
+
else
|
60
|
+
write("Invalid url - #{url}")
|
61
|
+
false
|
62
|
+
end
|
48
63
|
|
49
|
-
|
64
|
+
rescue URI::InvalidURIError
|
65
|
+
false
|
66
|
+
write("Invalid url - #{url}")
|
50
67
|
end
|
51
|
-
|
68
|
+
|
52
69
|
end
|
53
70
|
|
54
|
-
end
|
71
|
+
end
|
File without changes
|
@@ -0,0 +1,214 @@
|
|
1
|
+
require File.dirname(__FILE__) + '/../../spec_helper.rb'
|
2
|
+
|
3
|
+
describe Rawler::Crawler do
|
4
|
+
|
5
|
+
let(:url) { 'http://example.com' }
|
6
|
+
let(:output) { double("output", :error => nil) }
|
7
|
+
|
8
|
+
before(:each) do
|
9
|
+
Rawler.stub!(:url).and_return(url)
|
10
|
+
Rawler.stub!(:output).and_return(output)
|
11
|
+
end
|
12
|
+
|
13
|
+
context "basic functionality" do
|
14
|
+
|
15
|
+
let(:url) { 'http://example.com' }
|
16
|
+
let(:crawler) { Rawler::Crawler.new(url) }
|
17
|
+
let(:content) {
|
18
|
+
content = <<-content
|
19
|
+
<p><a href="http://example.com/foo">foo</a></p>
|
20
|
+
|
21
|
+
<p><a href="http://external.com/bar">bar</a></p>
|
22
|
+
content
|
23
|
+
}
|
24
|
+
|
25
|
+
before(:each) do
|
26
|
+
register(url, content)
|
27
|
+
end
|
28
|
+
|
29
|
+
it "should parse all links" do
|
30
|
+
crawler.links.should == ['http://example.com/foo', 'http://external.com/bar']
|
31
|
+
end
|
32
|
+
|
33
|
+
end
|
34
|
+
|
35
|
+
context "relative paths" do
|
36
|
+
|
37
|
+
let(:url) { 'http://example.com/path' }
|
38
|
+
let(:crawler) { Rawler::Crawler.new(url) }
|
39
|
+
let(:content) { '<a href="/foo">foo</a>' }
|
40
|
+
|
41
|
+
before(:each) do
|
42
|
+
register(url, content)
|
43
|
+
end
|
44
|
+
|
45
|
+
it "should parse relative links" do
|
46
|
+
crawler.links.should == ['http://example.com/foo']
|
47
|
+
end
|
48
|
+
|
49
|
+
end
|
50
|
+
|
51
|
+
context "different domains" do
|
52
|
+
|
53
|
+
let(:url) { 'http://external.com/path' }
|
54
|
+
let(:crawler) { Rawler::Crawler.new(url) }
|
55
|
+
let(:content) { '<a href="/foo">foo</a>' }
|
56
|
+
|
57
|
+
before(:each) do
|
58
|
+
Rawler.stub!(:url).and_return('http://example.com')
|
59
|
+
register(url, content)
|
60
|
+
end
|
61
|
+
|
62
|
+
it "should parse relative links" do
|
63
|
+
crawler.links.should == []
|
64
|
+
end
|
65
|
+
|
66
|
+
end
|
67
|
+
|
68
|
+
context "urls with hash tags" do
|
69
|
+
|
70
|
+
let(:url) { 'http://example.com/path' }
|
71
|
+
let(:crawler) { Rawler::Crawler.new(url) }
|
72
|
+
let(:content) { '<a href="/foo#bar">foo</a>' }
|
73
|
+
|
74
|
+
before(:each) do
|
75
|
+
register(url, content)
|
76
|
+
end
|
77
|
+
|
78
|
+
it "should parse relative links" do
|
79
|
+
crawler.links.should == ['http://example.com/foo#bar']
|
80
|
+
end
|
81
|
+
|
82
|
+
end
|
83
|
+
|
84
|
+
context "invalid urls" do
|
85
|
+
let(:url) { 'http://example.com/path' }
|
86
|
+
let(:crawler) { Rawler::Crawler.new(url) }
|
87
|
+
let(:js_url) { "javascript:fn('nbjmup;jhfs.esf{fio/dpn');" }
|
88
|
+
let(:content) { "<a href=\"#{js_url}\">foo</a>" }
|
89
|
+
|
90
|
+
before(:each) do
|
91
|
+
register(url, content)
|
92
|
+
end
|
93
|
+
|
94
|
+
it "should parse relative links" do
|
95
|
+
crawler.links.should == []
|
96
|
+
end
|
97
|
+
|
98
|
+
it "should report the error" do
|
99
|
+
crawler.should_receive(:write).with("Invalid url - #{js_url}")
|
100
|
+
crawler.links
|
101
|
+
end
|
102
|
+
end
|
103
|
+
|
104
|
+
|
105
|
+
context "content type" do
|
106
|
+
|
107
|
+
['text/plain', 'text/css', 'image/jpeg'].each do |content_type|
|
108
|
+
|
109
|
+
let(:url) { 'http://example.com' }
|
110
|
+
let(:crawler) { Rawler::Crawler.new(url) }
|
111
|
+
|
112
|
+
before(:each) do
|
113
|
+
register(url, '', 200, :content_type => content_type)
|
114
|
+
end
|
115
|
+
|
116
|
+
it "should ignore '#{content_type}'" do
|
117
|
+
crawler.links.should == []
|
118
|
+
end
|
119
|
+
|
120
|
+
end
|
121
|
+
end
|
122
|
+
|
123
|
+
context "Exceptions" do
|
124
|
+
|
125
|
+
let(:url) { 'http://example.com' }
|
126
|
+
let(:crawler) { Rawler::Crawler.new(url) }
|
127
|
+
let(:output) { double('output', :error => nil) }
|
128
|
+
|
129
|
+
before(:each) do
|
130
|
+
register(url, '')
|
131
|
+
Rawler.stub!(:output).and_return(output)
|
132
|
+
end
|
133
|
+
|
134
|
+
context "Errno::ECONNREFUSED" do
|
135
|
+
|
136
|
+
before(:each) do
|
137
|
+
Rawler::Request.stub!(:get).and_raise Errno::ECONNREFUSED
|
138
|
+
end
|
139
|
+
|
140
|
+
it "should return an empty array" do
|
141
|
+
crawler.links.should == []
|
142
|
+
end
|
143
|
+
|
144
|
+
it "should print a message when raising Errno::ECONNREFUSED" do
|
145
|
+
output.should_receive(:error).with("Couldn't connect to #{url}")
|
146
|
+
|
147
|
+
crawler.links
|
148
|
+
end
|
149
|
+
|
150
|
+
end
|
151
|
+
|
152
|
+
context "Errno::ETIMEDOUT" do
|
153
|
+
|
154
|
+
before(:each) do
|
155
|
+
Rawler::Request.stub!(:get).and_raise Errno::ETIMEDOUT
|
156
|
+
end
|
157
|
+
|
158
|
+
it "should return an empty array when raising Errno::ETIMEDOUT" do
|
159
|
+
crawler.links.should == []
|
160
|
+
end
|
161
|
+
|
162
|
+
it "should print a message when raising Errno::ETIMEDOUT" do
|
163
|
+
output.should_receive(:error).with("Connection to #{url} timed out")
|
164
|
+
|
165
|
+
crawler.links
|
166
|
+
end
|
167
|
+
|
168
|
+
end
|
169
|
+
|
170
|
+
end
|
171
|
+
|
172
|
+
context "http basic" do
|
173
|
+
|
174
|
+
let(:url) { 'http://example.com' }
|
175
|
+
let(:content) { '<a href="http://example.com/secret-path">foo</a>' }
|
176
|
+
let(:crawler) { Rawler::Crawler.new('http://example.com/secret') }
|
177
|
+
|
178
|
+
before(:each) do
|
179
|
+
register('http://example.com/secret', '', :status => ["401", "Unauthorized"])
|
180
|
+
register('http://foo:bar@example.com/secret', content)
|
181
|
+
|
182
|
+
Rawler.stub!(:username).and_return('foo')
|
183
|
+
Rawler.stub!(:password).and_return('bar')
|
184
|
+
end
|
185
|
+
|
186
|
+
it "should crawl http basic pages" do
|
187
|
+
crawler.links.should == ['http://example.com/secret-path']
|
188
|
+
end
|
189
|
+
|
190
|
+
end
|
191
|
+
|
192
|
+
context "url domain" do
|
193
|
+
|
194
|
+
let(:content) {
|
195
|
+
content = <<-content
|
196
|
+
<a href="http://example.com/valid">foo</a>
|
197
|
+
<a href="mailto:info@example.com">invalid</a>
|
198
|
+
<a href="https://foo.com">valid</a>
|
199
|
+
<a href=" http://fooo.com ">valid with illegal whitespaces</a>
|
200
|
+
content
|
201
|
+
}
|
202
|
+
let(:url) { 'http://example.com' }
|
203
|
+
let(:crawler) { Rawler::Crawler.new(url) }
|
204
|
+
|
205
|
+
before(:each) do
|
206
|
+
register(url, content)
|
207
|
+
end
|
208
|
+
|
209
|
+
it "should ignore links other than http or https" do
|
210
|
+
crawler.links.should == ['http://example.com/valid', 'https://foo.com', 'http://fooo.com']
|
211
|
+
end
|
212
|
+
end
|
213
|
+
|
214
|
+
end
|
@@ -6,6 +6,7 @@ describe Rawler::Base do
|
|
6
6
|
let(:rawler) { Rawler::Base.new('http://example.com', output) }
|
7
7
|
|
8
8
|
before(:each) do
|
9
|
+
Rawler.stub!(:output).and_return(output)
|
9
10
|
register('http://example.com', site)
|
10
11
|
end
|
11
12
|
|
@@ -42,10 +43,10 @@ describe Rawler::Base do
|
|
42
43
|
register('http://external.com', '')
|
43
44
|
register('http://external.com/foo', '', 302)
|
44
45
|
|
45
|
-
output.should_receive(:
|
46
|
-
output.should_receive(:
|
47
|
-
output.should_receive(:
|
48
|
-
output.should_receive(:
|
46
|
+
output.should_receive(:info).with('200 - http://example.com/foo1')
|
47
|
+
output.should_receive(:info).with('200 - http://example.com/foo2')
|
48
|
+
output.should_receive(:info).with('200 - http://external.com')
|
49
|
+
output.should_receive(:warn).with('302 - http://external.com/foo')
|
49
50
|
|
50
51
|
rawler.validate
|
51
52
|
end
|
@@ -54,7 +55,7 @@ describe Rawler::Base do
|
|
54
55
|
register('http://example.com/foo1', '<a href="http://example.com/page-with#hashtag">x</a>')
|
55
56
|
register('http://example.com/page-with', '')
|
56
57
|
|
57
|
-
output.should_receive(:
|
58
|
+
output.should_receive(:info).with('200 - http://example.com/page-with#hashtag')
|
58
59
|
|
59
60
|
rawler.validate
|
60
61
|
end
|
@@ -93,7 +94,7 @@ describe Rawler::Base do
|
|
93
94
|
|
94
95
|
Rawler::Request.should_receive(:get).and_raise Errno::ECONNREFUSED
|
95
96
|
|
96
|
-
output.should_receive(:
|
97
|
+
output.should_receive(:error).with("Connection refused - '#{url}'")
|
97
98
|
|
98
99
|
rawler.send(:add_status_code, url)
|
99
100
|
end
|
@@ -105,7 +106,7 @@ describe Rawler::Base do
|
|
105
106
|
|
106
107
|
Rawler::Request.should_receive(:get).and_raise error
|
107
108
|
|
108
|
-
output.should_receive(:
|
109
|
+
output.should_receive(:error).with("Connection problems - '#{url}'")
|
109
110
|
|
110
111
|
rawler.send(:add_status_code, url)
|
111
112
|
end
|
@@ -113,6 +114,76 @@ describe Rawler::Base do
|
|
113
114
|
|
114
115
|
end
|
115
116
|
|
117
|
+
describe "record_response" do
|
118
|
+
|
119
|
+
let(:message) { 'foo' }
|
120
|
+
|
121
|
+
context "response code 100" do
|
122
|
+
%w!100, 150, 199!.each do |code|
|
123
|
+
|
124
|
+
it "logger should receive info" do
|
125
|
+
output.should_receive(:info).with("#{code} - #{message}")
|
126
|
+
rawler.send(:record_response, code, message)
|
127
|
+
end
|
128
|
+
|
129
|
+
end
|
130
|
+
end
|
131
|
+
|
132
|
+
context "response code 200" do
|
133
|
+
%w!200, 250, 299!.each do |code|
|
134
|
+
|
135
|
+
it "logger should receive info" do
|
136
|
+
output.should_receive(:info).with("#{code} - #{message}")
|
137
|
+
rawler.send(:record_response, code, message)
|
138
|
+
end
|
139
|
+
|
140
|
+
end
|
141
|
+
end
|
142
|
+
|
143
|
+
context "response code 300" do
|
144
|
+
%w!300, 350, 399!.each do |code|
|
145
|
+
|
146
|
+
it "logger should receive warn" do
|
147
|
+
output.should_receive(:warn).with("#{code} - #{message}")
|
148
|
+
rawler.send(:record_response, code, message)
|
149
|
+
end
|
150
|
+
|
151
|
+
end
|
152
|
+
end
|
153
|
+
|
154
|
+
context "response code 400" do
|
155
|
+
%w!400, 450, 499!.each do |code|
|
156
|
+
|
157
|
+
it "logger should receive info" do
|
158
|
+
output.should_receive(:error).with("#{code} - #{message}")
|
159
|
+
rawler.send(:record_response, code, message)
|
160
|
+
end
|
161
|
+
|
162
|
+
end
|
163
|
+
end
|
164
|
+
|
165
|
+
context "response code 500" do
|
166
|
+
%w!400, 550, 599!.each do |code|
|
167
|
+
|
168
|
+
it "logger should receive info" do
|
169
|
+
output.should_receive(:error).with("#{code} - #{message}")
|
170
|
+
rawler.send(:record_response, code, message)
|
171
|
+
end
|
172
|
+
|
173
|
+
end
|
174
|
+
end
|
175
|
+
|
176
|
+
context "response code invalid" do
|
177
|
+
let(:code) { 600 }
|
178
|
+
|
179
|
+
it "logger should receive eror" do
|
180
|
+
output.should_receive(:error).with("Unknown code #{code} - #{message}")
|
181
|
+
rawler.send(:record_response, code, message)
|
182
|
+
end
|
183
|
+
end
|
184
|
+
|
185
|
+
end
|
186
|
+
|
116
187
|
|
117
188
|
private
|
118
189
|
|
@@ -129,4 +200,4 @@ describe Rawler::Base do
|
|
129
200
|
site
|
130
201
|
end
|
131
202
|
|
132
|
-
end
|
203
|
+
end
|
data/spec/spec_helper.rb
CHANGED
data/specs.watchr
ADDED
@@ -0,0 +1,59 @@
|
|
1
|
+
# Run me with:
|
2
|
+
#
|
3
|
+
# $ watchr specs.watchr
|
4
|
+
|
5
|
+
# --------------------------------------------------
|
6
|
+
# Convenience Methods
|
7
|
+
# --------------------------------------------------
|
8
|
+
def all_test_files
|
9
|
+
Dir['spec/**/*_spec.rb']
|
10
|
+
end
|
11
|
+
|
12
|
+
def run_test_matching(thing_to_match)
|
13
|
+
matches = all_test_files.grep(/#{thing_to_match}/i)
|
14
|
+
if matches.empty?
|
15
|
+
puts "Sorry, thanks for playing, but there were no matches for #{thing_to_match}"
|
16
|
+
else
|
17
|
+
run matches.join(' ')
|
18
|
+
end
|
19
|
+
end
|
20
|
+
|
21
|
+
def run(files_to_run)
|
22
|
+
puts("Running: #{files_to_run}")
|
23
|
+
system("clear;rspec -cfs #{files_to_run}")
|
24
|
+
no_int_for_you
|
25
|
+
end
|
26
|
+
|
27
|
+
def run_all_tests
|
28
|
+
puts "foo"
|
29
|
+
run(all_test_files.join(' '))
|
30
|
+
end
|
31
|
+
|
32
|
+
# --------------------------------------------------
|
33
|
+
# Watchr Rules
|
34
|
+
# --------------------------------------------------
|
35
|
+
watch('^spec/(.*)_spec\.rb' ) { |m| run_test_matching(m[1]) }
|
36
|
+
watch('^lib/(.*)\.rb' ) { |m| run_test_matching(m[1]) }
|
37
|
+
watch('^spec/spec_helper\.rb') { run_all_tests }
|
38
|
+
# --------------------------------------------------
|
39
|
+
# Signal Handling
|
40
|
+
# --------------------------------------------------
|
41
|
+
|
42
|
+
def no_int_for_you
|
43
|
+
@sent_an_int = nil
|
44
|
+
end
|
45
|
+
|
46
|
+
Signal.trap 'INT' do
|
47
|
+
if @sent_an_int then
|
48
|
+
puts " A second INT? Ok, I get the message. Shutting down now."
|
49
|
+
exit
|
50
|
+
else
|
51
|
+
puts " Did you just send me an INT? Ugh. I'll quit for real if you do it again."
|
52
|
+
@sent_an_int = true
|
53
|
+
Kernel.sleep 1.5
|
54
|
+
run_all_tests
|
55
|
+
end
|
56
|
+
end
|
57
|
+
|
58
|
+
# vim:ft=ruby
|
59
|
+
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rawler
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 19
|
5
5
|
prerelease:
|
6
6
|
segments:
|
7
7
|
- 0
|
8
8
|
- 0
|
9
|
-
- 5
|
10
|
-
version: 0.0.5
|
9
|
+
- 6
|
10
|
+
version: 0.0.6
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Oscar Del Ben
|
@@ -15,7 +15,7 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date: 2011-
|
18
|
+
date: 2011-02-04 00:00:00 +01:00
|
19
19
|
default_executable:
|
20
20
|
dependencies:
|
21
21
|
- !ruby/object:Gem::Dependency
|
@@ -52,8 +52,6 @@ description: |-
|
|
52
52
|
Rawler is a Ruby library that crawls your website and checks the status code for each of your links. Useful for finding dead links.
|
53
53
|
|
54
54
|
Rawler will only parse pages with content type 'text/html', but it will check for the response code of every link.
|
55
|
-
|
56
|
-
Please note: I had to temporarily remove url encoding in order to resolve some issues, so if you find any issue, please let me know. I'm also going to use Mechanizer for parsing pages with the next release.
|
57
55
|
email:
|
58
56
|
- info@oscardelben.com
|
59
57
|
executables:
|
@@ -65,7 +63,8 @@ extra_rdoc_files:
|
|
65
63
|
- Manifest.txt
|
66
64
|
- README.txt
|
67
65
|
files:
|
68
|
-
-
|
66
|
+
- Gemfile
|
67
|
+
- Gemfile.lock
|
69
68
|
- History.txt
|
70
69
|
- Manifest.txt
|
71
70
|
- README.txt
|
@@ -77,14 +76,13 @@ files:
|
|
77
76
|
- lib/rawler/core_extensions/module.rb
|
78
77
|
- lib/rawler/crawler.rb
|
79
78
|
- lib/rawler/request.rb
|
79
|
+
- spec/lib/base_spec.rb
|
80
|
+
- spec/lib/rawler/base_spec.rb
|
81
|
+
- spec/lib/rawler/crawler_spec.rb
|
82
|
+
- spec/lib/rawler_spec.rb
|
80
83
|
- spec/spec.opts
|
81
84
|
- spec/spec_helper.rb
|
82
|
-
-
|
83
|
-
- spec/unit/crawler/base_spec.rb
|
84
|
-
- spec/unit/crawler/content_type_spec.rb
|
85
|
-
- spec/unit/crawler/exceptions_spec.rb
|
86
|
-
- spec/unit/crawler/http_basic_spec.rb
|
87
|
-
- spec/unit/crawler/url_domain_spec.rb
|
85
|
+
- specs.watchr
|
88
86
|
- tasks/rspec.rake
|
89
87
|
- vendor/lib-trollop.rb
|
90
88
|
has_rdoc: true
|
data/.autotest
DELETED
@@ -1,23 +0,0 @@
|
|
1
|
-
# -*- ruby -*-
|
2
|
-
|
3
|
-
require 'autotest/restart'
|
4
|
-
|
5
|
-
# Autotest.add_hook :initialize do |at|
|
6
|
-
# at.extra_files << "../some/external/dependency.rb"
|
7
|
-
#
|
8
|
-
# at.libs << ":../some/external"
|
9
|
-
#
|
10
|
-
# at.add_exception 'vendor'
|
11
|
-
#
|
12
|
-
# at.add_mapping(/dependency.rb/) do |f, _|
|
13
|
-
# at.files_matching(/test_.*rb$/)
|
14
|
-
# end
|
15
|
-
#
|
16
|
-
# %w(TestA TestB).each do |klass|
|
17
|
-
# at.extra_class_map[klass] = "test/test_misc.rb"
|
18
|
-
# end
|
19
|
-
# end
|
20
|
-
|
21
|
-
# Autotest.add_hook :run_command do |at|
|
22
|
-
# system "rake build"
|
23
|
-
# end
|
@@ -1,75 +0,0 @@
|
|
1
|
-
require File.dirname(__FILE__) + '/../../spec_helper.rb'
|
2
|
-
|
3
|
-
describe Rawler::Crawler do
|
4
|
-
|
5
|
-
context "basic functionality" do
|
6
|
-
|
7
|
-
let(:url) { 'http://example.com' }
|
8
|
-
let(:crawler) { Rawler::Crawler.new(url) }
|
9
|
-
let(:content) {
|
10
|
-
content = <<-content
|
11
|
-
<p><a href="http://example.com/foo">foo</a></p>
|
12
|
-
|
13
|
-
<p><a href="http://external.com/bar">bar</a></p>
|
14
|
-
content
|
15
|
-
}
|
16
|
-
|
17
|
-
before(:each) do
|
18
|
-
register(url, content)
|
19
|
-
end
|
20
|
-
|
21
|
-
it "should parse all links" do
|
22
|
-
crawler.links.should == ['http://example.com/foo', 'http://external.com/bar']
|
23
|
-
end
|
24
|
-
|
25
|
-
end
|
26
|
-
|
27
|
-
context "relative paths" do
|
28
|
-
|
29
|
-
let(:url) { 'http://example.com/path' }
|
30
|
-
let(:crawler) { Rawler::Crawler.new(url) }
|
31
|
-
let(:content) { '<a href="/foo">foo</a>' }
|
32
|
-
|
33
|
-
before(:each) do
|
34
|
-
register(url, content)
|
35
|
-
end
|
36
|
-
|
37
|
-
it "should parse relative links" do
|
38
|
-
crawler.links.should == ['http://example.com/foo']
|
39
|
-
end
|
40
|
-
|
41
|
-
end
|
42
|
-
|
43
|
-
context "different domains" do
|
44
|
-
|
45
|
-
let(:url) { 'http://external.com/path' }
|
46
|
-
let(:crawler) { Rawler::Crawler.new(url) }
|
47
|
-
let(:content) { '<a href="/foo">foo</a>' }
|
48
|
-
|
49
|
-
before(:each) do
|
50
|
-
register(url, content)
|
51
|
-
end
|
52
|
-
|
53
|
-
it "should parse relative links" do
|
54
|
-
crawler.links.should == []
|
55
|
-
end
|
56
|
-
|
57
|
-
end
|
58
|
-
|
59
|
-
context "urls with hash tags" do
|
60
|
-
|
61
|
-
let(:url) { 'http://example.com/path' }
|
62
|
-
let(:crawler) { Rawler::Crawler.new(url) }
|
63
|
-
let(:content) { '<a href="/foo#bar">foo</a>' }
|
64
|
-
|
65
|
-
before(:each) do
|
66
|
-
register(url, content)
|
67
|
-
end
|
68
|
-
|
69
|
-
it "should parse relative links" do
|
70
|
-
crawler.links.should == ['http://example.com/foo#bar']
|
71
|
-
end
|
72
|
-
|
73
|
-
end
|
74
|
-
|
75
|
-
end
|
@@ -1,23 +0,0 @@
|
|
1
|
-
require File.dirname(__FILE__) + '/../../spec_helper.rb'
|
2
|
-
|
3
|
-
describe Rawler::Crawler do
|
4
|
-
|
5
|
-
context "content type" do
|
6
|
-
|
7
|
-
['text/plain', 'text/css', 'image/jpeg'].each do |content_type|
|
8
|
-
|
9
|
-
let(:url) { 'http://example.com' }
|
10
|
-
let(:crawler) { Rawler::Crawler.new(url) }
|
11
|
-
|
12
|
-
before(:each) do
|
13
|
-
register(url, '', 200, :content_type => content_type)
|
14
|
-
end
|
15
|
-
|
16
|
-
it "should ignore '#{content_type}'" do
|
17
|
-
crawler.links.should == []
|
18
|
-
end
|
19
|
-
|
20
|
-
end
|
21
|
-
end
|
22
|
-
|
23
|
-
end
|
@@ -1,54 +0,0 @@
|
|
1
|
-
require File.dirname(__FILE__) + '/../../spec_helper.rb'
|
2
|
-
|
3
|
-
describe Rawler::Crawler do
|
4
|
-
|
5
|
-
context "Exceptions" do
|
6
|
-
|
7
|
-
let(:url) { 'http://example.com' }
|
8
|
-
let(:crawler) { Rawler::Crawler.new(url) }
|
9
|
-
let(:output) { double('output', :puts => nil) }
|
10
|
-
|
11
|
-
before(:each) do
|
12
|
-
register(url, '')
|
13
|
-
Rawler.stub!(:output).and_return(output)
|
14
|
-
end
|
15
|
-
|
16
|
-
context "Errno::ECONNREFUSED" do
|
17
|
-
|
18
|
-
before(:each) do
|
19
|
-
Rawler::Request.stub!(:get).and_raise Errno::ECONNREFUSED
|
20
|
-
end
|
21
|
-
|
22
|
-
it "should return an empty array" do
|
23
|
-
crawler.links.should == []
|
24
|
-
end
|
25
|
-
|
26
|
-
it "should print a message when raising Errno::ECONNREFUSED" do
|
27
|
-
output.should_receive(:puts).with("Couldn't connect to #{url}")
|
28
|
-
|
29
|
-
crawler.links
|
30
|
-
end
|
31
|
-
|
32
|
-
end
|
33
|
-
|
34
|
-
context "Errno::ETIMEDOUT" do
|
35
|
-
|
36
|
-
before(:each) do
|
37
|
-
Rawler::Request.stub!(:get).and_raise Errno::ETIMEDOUT
|
38
|
-
end
|
39
|
-
|
40
|
-
it "should return an empty array when raising Errno::ETIMEDOUT" do
|
41
|
-
crawler.links.should == []
|
42
|
-
end
|
43
|
-
|
44
|
-
it "should print a message when raising Errno::ETIMEDOUT" do
|
45
|
-
output.should_receive(:puts).with("Connection to #{url} timed out")
|
46
|
-
|
47
|
-
crawler.links
|
48
|
-
end
|
49
|
-
|
50
|
-
end
|
51
|
-
|
52
|
-
end
|
53
|
-
|
54
|
-
end
|
@@ -1,25 +0,0 @@
|
|
1
|
-
require File.dirname(__FILE__) + '/../../spec_helper.rb'
|
2
|
-
|
3
|
-
describe Rawler::Crawler do
|
4
|
-
|
5
|
-
context "http basic" do
|
6
|
-
|
7
|
-
let(:url) { 'http://example.com' }
|
8
|
-
let(:content) { '<a href="http://example.com/secret-path">foo</a>' }
|
9
|
-
let(:crawler) { Rawler::Crawler.new('http://example.com/secret') }
|
10
|
-
|
11
|
-
before(:each) do
|
12
|
-
register('http://example.com/secret', '', :status => ["401", "Unauthorized"])
|
13
|
-
register('http://foo:bar@example.com/secret', content)
|
14
|
-
|
15
|
-
Rawler.stub!(:username).and_return('foo')
|
16
|
-
Rawler.stub!(:password).and_return('bar')
|
17
|
-
end
|
18
|
-
|
19
|
-
it "should crawl http basic pages" do
|
20
|
-
crawler.links.should == ['http://example.com/secret-path']
|
21
|
-
end
|
22
|
-
|
23
|
-
end
|
24
|
-
|
25
|
-
end
|
@@ -1,26 +0,0 @@
|
|
1
|
-
require File.dirname(__FILE__) + '/../../spec_helper.rb'
|
2
|
-
|
3
|
-
describe Rawler::Crawler do
|
4
|
-
|
5
|
-
context "url domain" do
|
6
|
-
|
7
|
-
let(:content) {
|
8
|
-
content = <<-content
|
9
|
-
<a href="http://example.com/valid">foo</a>
|
10
|
-
<a href="mailto:info@example.com">invalid</a>
|
11
|
-
<a href="https://foo.com">valid</a>
|
12
|
-
content
|
13
|
-
}
|
14
|
-
let(:url) { 'http://example.com' }
|
15
|
-
let(:crawler) { Rawler::Crawler.new(url) }
|
16
|
-
|
17
|
-
before(:each) do
|
18
|
-
register(url, content)
|
19
|
-
end
|
20
|
-
|
21
|
-
it "should ignore links other than http or https" do
|
22
|
-
crawler.links.should == ['http://example.com/valid', 'https://foo.com']
|
23
|
-
end
|
24
|
-
end
|
25
|
-
|
26
|
-
end
|