rawler 0.0.8 → 0.0.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.txt +2 -9
- data/lib/rawler.rb +1 -1
- data/lib/rawler/base.rb +9 -7
- data/lib/rawler/crawler.rb +6 -2
- data/spec/lib/rawler/crawler_spec.rb +16 -3
- data/spec/lib/rawler_spec.rb +16 -5
- metadata +3 -3
data/README.txt
CHANGED
@@ -13,8 +13,8 @@ Rawler will only parse pages with content type 'text/html', but it will check fo
|
|
13
13
|
rawler http://example.com [options]
|
14
14
|
|
15
15
|
where [options] are:
|
16
|
-
--username, -u <s>:
|
17
|
-
--password, -p <s>:
|
16
|
+
--username, -u <s>: HTTP Basic Username
|
17
|
+
--password, -p <s>: HTTP Basic Password
|
18
18
|
--version, -v: Print version and exit
|
19
19
|
--help, -h: Show this message
|
20
20
|
|
@@ -40,13 +40,6 @@ If you add files, run:
|
|
40
40
|
|
41
41
|
And add them to the Manifest file.
|
42
42
|
|
43
|
-
== TODO
|
44
|
-
|
45
|
-
* Add logger levels
|
46
|
-
* Follow redirects, but still inform about them
|
47
|
-
* Respect robots.txt
|
48
|
-
* Export to html
|
49
|
-
|
50
43
|
== CONTRIBUTORS:
|
51
44
|
|
52
45
|
* bcoob
|
data/lib/rawler.rb
CHANGED
data/lib/rawler/base.rb
CHANGED
@@ -36,14 +36,16 @@ module Rawler
|
|
36
36
|
|
37
37
|
def add_status_code(link, from_url)
|
38
38
|
response = Rawler::Request.get(link)
|
39
|
+
|
40
|
+
validate_page(response['Location'], from_url) if response['Location']
|
39
41
|
|
40
|
-
record_response(response.code, link, from_url)
|
42
|
+
record_response(response.code, link, from_url, response['Location'])
|
41
43
|
responses[link] = { :status => response.code.to_i }
|
42
44
|
rescue Errno::ECONNREFUSED
|
43
|
-
|
45
|
+
error("Connection refused - #{link} - Called from: #{from_url}")
|
44
46
|
rescue Timeout::Error, Errno::EINVAL, Errno::ECONNRESET, Errno::ETIMEDOUT,
|
45
47
|
EOFError, Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError, Net::ProtocolError, SocketError
|
46
|
-
|
48
|
+
error("Connection problems - #{link} - Called from: #{from_url}")
|
47
49
|
end
|
48
50
|
|
49
51
|
def same_domain?(link)
|
@@ -54,18 +56,18 @@ module Rawler
|
|
54
56
|
responses[link].nil?
|
55
57
|
end
|
56
58
|
|
57
|
-
def
|
58
|
-
# TODO: This may not always be an error message,
|
59
|
-
# but that will make it show up most of the time
|
59
|
+
def error(message)
|
60
60
|
Rawler.output.error(message)
|
61
61
|
end
|
62
62
|
|
63
|
-
def record_response(code, link, from_url)
|
63
|
+
def record_response(code, link, from_url, redirection=nil)
|
64
64
|
message = "#{code} - #{link}"
|
65
65
|
|
66
66
|
if code.to_i >= 300
|
67
67
|
message += " - Called from: #{from_url}"
|
68
68
|
end
|
69
|
+
|
70
|
+
message += " - Following redirection to: #{redirection}" if redirection
|
69
71
|
|
70
72
|
code = code.to_i
|
71
73
|
case code / 100
|
data/lib/rawler/crawler.rb
CHANGED
@@ -19,10 +19,10 @@ module Rawler
|
|
19
19
|
|
20
20
|
doc = Nokogiri::HTML(response.body)
|
21
21
|
doc.css('a').map { |a| a['href'] }.select { |url| !url.nil? }.map { |url| absolute_url(url) }.select { |url| valid_url?(url) }
|
22
|
-
rescue Errno::ECONNREFUSED
|
22
|
+
rescue Errno::ECONNREFUSED # TODO: add called from
|
23
23
|
write("Couldn't connect to #{url}")
|
24
24
|
[]
|
25
|
-
rescue Errno::ETIMEDOUT
|
25
|
+
rescue Errno::ETIMEDOUT # TODO: add called from
|
26
26
|
write("Connection to #{url} timed out")
|
27
27
|
[]
|
28
28
|
end
|
@@ -38,8 +38,12 @@ module Rawler
|
|
38
38
|
else
|
39
39
|
path
|
40
40
|
end
|
41
|
+
rescue URI::InvalidURIError
|
42
|
+
write("Invalid url: #{path} - Called from: #{url}")
|
43
|
+
nil
|
41
44
|
end
|
42
45
|
|
46
|
+
# TODO: add 'called from in a more pragmatic way as an optional parameter
|
43
47
|
def write(message)
|
44
48
|
Rawler.output.error(message)
|
45
49
|
end
|
data/spec/lib/rawler/crawler_spec.rb
CHANGED
@@ -5,7 +5,7 @@ require File.dirname(__FILE__) + '/../../spec_helper.rb'
|
|
5
5
|
describe Rawler::Crawler do
|
6
6
|
|
7
7
|
let(:url) { 'http://example.com' }
|
8
|
-
let(:output)
|
8
|
+
let(:output) { double('output', :error => nil) }
|
9
9
|
|
10
10
|
before(:each) do
|
11
11
|
Rawler.stub!(:url).and_return(url)
|
@@ -164,11 +164,9 @@ describe Rawler::Crawler do
|
|
164
164
|
|
165
165
|
let(:url) { 'http://example.com' }
|
166
166
|
let(:crawler) { Rawler::Crawler.new(url) }
|
167
|
-
let(:output) { double('output', :error => nil) }
|
168
167
|
|
169
168
|
before(:each) do
|
170
169
|
register(url, '')
|
171
|
-
Rawler.stub!(:output).and_return(output)
|
172
170
|
end
|
173
171
|
|
174
172
|
context "Errno::ECONNREFUSED" do
|
@@ -250,5 +248,20 @@ describe Rawler::Crawler do
|
|
250
248
|
crawler.links.should == ['http://example.com/valid', 'https://foo.com', 'http://fooo.com']
|
251
249
|
end
|
252
250
|
end
|
251
|
+
|
252
|
+
context "invalid urls" do
|
253
|
+
let(:content) { '<a href="http://foo;bar">foo</a>' }
|
254
|
+
let(:url) { 'http://example.com' }
|
255
|
+
let(:crawler) { Rawler::Crawler.new(url) }
|
256
|
+
|
257
|
+
before(:each) do
|
258
|
+
register(url, content)
|
259
|
+
end
|
260
|
+
|
261
|
+
it "should notify about the invalid url" do
|
262
|
+
output.should_receive(:error).with('Invalid url: http://foo;bar - Called from: http://example.com')
|
263
|
+
crawler.links.should == []
|
264
|
+
end
|
265
|
+
end
|
253
266
|
|
254
267
|
end
|
data/spec/lib/rawler_spec.rb
CHANGED
@@ -53,16 +53,27 @@ describe Rawler::Base do
|
|
53
53
|
register('http://example.com/foo1', '<a href="http://external.com/foo">x</a>')
|
54
54
|
register('http://example.com/foo2', '')
|
55
55
|
register('http://external.com', '')
|
56
|
-
register('http://external.com/foo', '',
|
56
|
+
register('http://external.com/foo', '', 301)
|
57
57
|
|
58
58
|
output.should_receive(:info).with('200 - http://example.com/foo1')
|
59
59
|
output.should_receive(:info).with('200 - http://example.com/foo2')
|
60
60
|
output.should_receive(:info).with('200 - http://external.com')
|
61
|
-
output.should_receive(:warn).with('
|
61
|
+
output.should_receive(:warn).with('301 - http://external.com/foo - Called from: http://example.com/foo1')
|
62
62
|
|
63
63
|
rawler.validate
|
64
64
|
end
|
65
65
|
|
66
|
+
it "should follow redirections but inform about them" do
|
67
|
+
register('http://example.com', '<a href="/foo">foo</a>')
|
68
|
+
register('http://example.com/foo', '', 301, :location => 'http://example.com/bar')
|
69
|
+
register('http://example.com/bar', '')
|
70
|
+
|
71
|
+
output.should_receive(:warn).with('301 - http://example.com/foo - Called from: http://example.com - Following redirection to: http://example.com/bar')
|
72
|
+
output.should_receive(:info).with('200 - http://example.com/bar')
|
73
|
+
|
74
|
+
rawler.validate
|
75
|
+
end
|
76
|
+
|
66
77
|
end
|
67
78
|
|
68
79
|
describe "get_status_code" do
|
@@ -77,14 +88,14 @@ describe Rawler::Base do
|
|
77
88
|
rawler.responses[url][:status].should == 200
|
78
89
|
end
|
79
90
|
|
80
|
-
it "should add to
|
91
|
+
it "should add to 301 links" do
|
81
92
|
url = 'http://example.com/foo'
|
82
93
|
from = 'http://other.com'
|
83
|
-
register(url, '',
|
94
|
+
register(url, '', 301)
|
84
95
|
|
85
96
|
rawler.send(:add_status_code, url, from)
|
86
97
|
|
87
|
-
rawler.responses[url][:status].should ==
|
98
|
+
rawler.responses[url][:status].should == 301
|
88
99
|
end
|
89
100
|
|
90
101
|
it "should save username and password" do
|
metadata
CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
|
|
5
5
|
segments:
|
6
6
|
- 0
|
7
7
|
- 0
|
8
|
-
- 8
|
9
|
-
version: 0.0.8
|
8
|
+
- 9
|
9
|
+
version: 0.0.9
|
10
10
|
platform: ruby
|
11
11
|
authors:
|
12
12
|
- Oscar Del Ben
|
@@ -14,7 +14,7 @@ autorequire:
|
|
14
14
|
bindir: bin
|
15
15
|
cert_chain: []
|
16
16
|
|
17
|
-
date: 2011-03-
|
17
|
+
date: 2011-03-25 00:00:00 +01:00
|
18
18
|
default_executable:
|
19
19
|
dependencies:
|
20
20
|
- !ruby/object:Gem::Dependency
|