rawler 0.0.8 → 0.0.9
Sign up to get free protection for your applications and to get access to all the features.
- data/README.txt +2 -9
- data/lib/rawler.rb +1 -1
- data/lib/rawler/base.rb +9 -7
- data/lib/rawler/crawler.rb +6 -2
- data/spec/lib/rawler/crawler_spec.rb +16 -3
- data/spec/lib/rawler_spec.rb +16 -5
- metadata +3 -3
data/README.txt
CHANGED
@@ -13,8 +13,8 @@ Rawler will only parse pages with content type 'text/html', but it will check fo
|
|
13
13
|
rawler http://example.com [options]
|
14
14
|
|
15
15
|
where [options] are:
|
16
|
-
--username, -u <s>:
|
17
|
-
--password, -p <s>:
|
16
|
+
--username, -u <s>: HTTP Basic Username
|
17
|
+
--password, -p <s>: HTTP Basic Password
|
18
18
|
--version, -v: Print version and exit
|
19
19
|
--help, -h: Show this message
|
20
20
|
|
@@ -40,13 +40,6 @@ If you add files, run:
|
|
40
40
|
|
41
41
|
And add them to the Manifest file.
|
42
42
|
|
43
|
-
== TODO
|
44
|
-
|
45
|
-
* Add logger levels
|
46
|
-
* Follow redirects, but still inform about them
|
47
|
-
* Respect robots.txt
|
48
|
-
* Export to html
|
49
|
-
|
50
43
|
== CONTRIBUTORS:
|
51
44
|
|
52
45
|
* bcoob
|
data/lib/rawler.rb
CHANGED
data/lib/rawler/base.rb
CHANGED
@@ -36,14 +36,16 @@ module Rawler
|
|
36
36
|
|
37
37
|
def add_status_code(link, from_url)
|
38
38
|
response = Rawler::Request.get(link)
|
39
|
+
|
40
|
+
validate_page(response['Location'], from_url) if response['Location']
|
39
41
|
|
40
|
-
record_response(response.code, link, from_url)
|
42
|
+
record_response(response.code, link, from_url, response['Location'])
|
41
43
|
responses[link] = { :status => response.code.to_i }
|
42
44
|
rescue Errno::ECONNREFUSED
|
43
|
-
|
45
|
+
error("Connection refused - #{link} - Called from: #{from_url}")
|
44
46
|
rescue Timeout::Error, Errno::EINVAL, Errno::ECONNRESET, Errno::ETIMEDOUT,
|
45
47
|
EOFError, Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError, Net::ProtocolError, SocketError
|
46
|
-
|
48
|
+
error("Connection problems - #{link} - Called from: #{from_url}")
|
47
49
|
end
|
48
50
|
|
49
51
|
def same_domain?(link)
|
@@ -54,18 +56,18 @@ module Rawler
|
|
54
56
|
responses[link].nil?
|
55
57
|
end
|
56
58
|
|
57
|
-
def
|
58
|
-
# TODO: This may not always be an error message,
|
59
|
-
# but that will make it show up most of the time
|
59
|
+
def error(message)
|
60
60
|
Rawler.output.error(message)
|
61
61
|
end
|
62
62
|
|
63
|
-
def record_response(code, link, from_url)
|
63
|
+
def record_response(code, link, from_url, redirection=nil)
|
64
64
|
message = "#{code} - #{link}"
|
65
65
|
|
66
66
|
if code.to_i >= 300
|
67
67
|
message += " - Called from: #{from_url}"
|
68
68
|
end
|
69
|
+
|
70
|
+
message += " - Following redirection to: #{redirection}" if redirection
|
69
71
|
|
70
72
|
code = code.to_i
|
71
73
|
case code / 100
|
data/lib/rawler/crawler.rb
CHANGED
@@ -19,10 +19,10 @@ module Rawler
|
|
19
19
|
|
20
20
|
doc = Nokogiri::HTML(response.body)
|
21
21
|
doc.css('a').map { |a| a['href'] }.select { |url| !url.nil? }.map { |url| absolute_url(url) }.select { |url| valid_url?(url) }
|
22
|
-
rescue Errno::ECONNREFUSED
|
22
|
+
rescue Errno::ECONNREFUSED # TODO: add called from
|
23
23
|
write("Couldn't connect to #{url}")
|
24
24
|
[]
|
25
|
-
rescue Errno::ETIMEDOUT
|
25
|
+
rescue Errno::ETIMEDOUT # TODO: add called from
|
26
26
|
write("Connection to #{url} timed out")
|
27
27
|
[]
|
28
28
|
end
|
@@ -38,8 +38,12 @@ module Rawler
|
|
38
38
|
else
|
39
39
|
path
|
40
40
|
end
|
41
|
+
rescue URI::InvalidURIError
|
42
|
+
write("Invalid url: #{path} - Called from: #{url}")
|
43
|
+
nil
|
41
44
|
end
|
42
45
|
|
46
|
+
# TODO: add 'called from' in a more pragmatic way as an optional parameter
|
43
47
|
def write(message)
|
44
48
|
Rawler.output.error(message)
|
45
49
|
end
|
data/spec/lib/rawler/crawler_spec.rb
CHANGED
@@ -5,7 +5,7 @@ require File.dirname(__FILE__) + '/../../spec_helper.rb'
|
|
5
5
|
describe Rawler::Crawler do
|
6
6
|
|
7
7
|
let(:url) { 'http://example.com' }
|
8
|
-
let(:output)
|
8
|
+
let(:output) { double('output', :error => nil) }
|
9
9
|
|
10
10
|
before(:each) do
|
11
11
|
Rawler.stub!(:url).and_return(url)
|
@@ -164,11 +164,9 @@ describe Rawler::Crawler do
|
|
164
164
|
|
165
165
|
let(:url) { 'http://example.com' }
|
166
166
|
let(:crawler) { Rawler::Crawler.new(url) }
|
167
|
-
let(:output) { double('output', :error => nil) }
|
168
167
|
|
169
168
|
before(:each) do
|
170
169
|
register(url, '')
|
171
|
-
Rawler.stub!(:output).and_return(output)
|
172
170
|
end
|
173
171
|
|
174
172
|
context "Errno::ECONNREFUSED" do
|
@@ -250,5 +248,20 @@ describe Rawler::Crawler do
|
|
250
248
|
crawler.links.should == ['http://example.com/valid', 'https://foo.com', 'http://fooo.com']
|
251
249
|
end
|
252
250
|
end
|
251
|
+
|
252
|
+
context "invalid urls" do
|
253
|
+
let(:content) { '<a href="http://foo;bar">foo</a>' }
|
254
|
+
let(:url) { 'http://example.com' }
|
255
|
+
let(:crawler) { Rawler::Crawler.new(url) }
|
256
|
+
|
257
|
+
before(:each) do
|
258
|
+
register(url, content)
|
259
|
+
end
|
260
|
+
|
261
|
+
it "should notify about the invalid url" do
|
262
|
+
output.should_receive(:error).with('Invalid url: http://foo;bar - Called from: http://example.com')
|
263
|
+
crawler.links.should == []
|
264
|
+
end
|
265
|
+
end
|
253
266
|
|
254
267
|
end
|
data/spec/lib/rawler_spec.rb
CHANGED
@@ -53,16 +53,27 @@ describe Rawler::Base do
|
|
53
53
|
register('http://example.com/foo1', '<a href="http://external.com/foo">x</a>')
|
54
54
|
register('http://example.com/foo2', '')
|
55
55
|
register('http://external.com', '')
|
56
|
-
register('http://external.com/foo', '',
|
56
|
+
register('http://external.com/foo', '', 301)
|
57
57
|
|
58
58
|
output.should_receive(:info).with('200 - http://example.com/foo1')
|
59
59
|
output.should_receive(:info).with('200 - http://example.com/foo2')
|
60
60
|
output.should_receive(:info).with('200 - http://external.com')
|
61
|
-
output.should_receive(:warn).with('
|
61
|
+
output.should_receive(:warn).with('301 - http://external.com/foo - Called from: http://example.com/foo1')
|
62
62
|
|
63
63
|
rawler.validate
|
64
64
|
end
|
65
65
|
|
66
|
+
it "should follow redirections but inform about them" do
|
67
|
+
register('http://example.com', '<a href="/foo">foo</a>')
|
68
|
+
register('http://example.com/foo', '', 301, :location => 'http://example.com/bar')
|
69
|
+
register('http://example.com/bar', '')
|
70
|
+
|
71
|
+
output.should_receive(:warn).with('301 - http://example.com/foo - Called from: http://example.com - Following redirection to: http://example.com/bar')
|
72
|
+
output.should_receive(:info).with('200 - http://example.com/bar')
|
73
|
+
|
74
|
+
rawler.validate
|
75
|
+
end
|
76
|
+
|
66
77
|
end
|
67
78
|
|
68
79
|
describe "get_status_code" do
|
@@ -77,14 +88,14 @@ describe Rawler::Base do
|
|
77
88
|
rawler.responses[url][:status].should == 200
|
78
89
|
end
|
79
90
|
|
80
|
-
it "should add to
|
91
|
+
it "should add to 301 links" do
|
81
92
|
url = 'http://example.com/foo'
|
82
93
|
from = 'http://other.com'
|
83
|
-
register(url, '',
|
94
|
+
register(url, '', 301)
|
84
95
|
|
85
96
|
rawler.send(:add_status_code, url, from)
|
86
97
|
|
87
|
-
rawler.responses[url][:status].should ==
|
98
|
+
rawler.responses[url][:status].should == 301
|
88
99
|
end
|
89
100
|
|
90
101
|
it "should save username and password" do
|
metadata
CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
|
|
5
5
|
segments:
|
6
6
|
- 0
|
7
7
|
- 0
|
8
|
-
- 8
|
9
|
-
version: 0.0.8
|
8
|
+
- 9
|
9
|
+
version: 0.0.9
|
10
10
|
platform: ruby
|
11
11
|
authors:
|
12
12
|
- Oscar Del Ben
|
@@ -14,7 +14,7 @@ autorequire:
|
|
14
14
|
bindir: bin
|
15
15
|
cert_chain: []
|
16
16
|
|
17
|
-
date: 2011-03-
|
17
|
+
date: 2011-03-25 00:00:00 +01:00
|
18
18
|
default_executable:
|
19
19
|
dependencies:
|
20
20
|
- !ruby/object:Gem::Dependency
|