rawler 0.0.7 → 0.0.8
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/rawler/base.rb +14 -9
- data/lib/rawler/crawler.rb +3 -1
- data/lib/rawler.rb +1 -1
- data/spec/lib/rawler/crawler_spec.rb +36 -14
- data/spec/lib/rawler_spec.rb +27 -22
- metadata +3 -3
data/lib/rawler/base.rb
CHANGED
@@ -21,29 +21,29 @@ module Rawler
|
|
21
21
|
|
22
22
|
def validate_links_in_page(current_url)
|
23
23
|
Rawler::Crawler.new(current_url).links.each do |page_url|
|
24
|
-
validate_page(page_url)
|
24
|
+
validate_page(page_url, current_url)
|
25
25
|
# Todo: include this in a configuration option
|
26
26
|
sleep(3)
|
27
27
|
end
|
28
28
|
end
|
29
29
|
|
30
|
-
def validate_page(page_url)
|
30
|
+
def validate_page(page_url, from_url)
|
31
31
|
if not_yet_parsed?(page_url)
|
32
|
-
add_status_code(page_url)
|
32
|
+
add_status_code(page_url, from_url)
|
33
33
|
validate_links_in_page(page_url) if same_domain?(page_url)
|
34
34
|
end
|
35
35
|
end
|
36
36
|
|
37
|
-
def add_status_code(link)
|
37
|
+
def add_status_code(link, from_url)
|
38
38
|
response = Rawler::Request.get(link)
|
39
39
|
|
40
|
-
record_response(response.code, link)
|
40
|
+
record_response(response.code, link, from_url)
|
41
41
|
responses[link] = { :status => response.code.to_i }
|
42
42
|
rescue Errno::ECONNREFUSED
|
43
|
-
Rawler.output.error("Connection refused -
|
43
|
+
Rawler.output.error("Connection refused - #{link} - Called from: #{from_url}")
|
44
44
|
rescue Timeout::Error, Errno::EINVAL, Errno::ECONNRESET, Errno::ETIMEDOUT,
|
45
|
-
EOFError, Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError, Net::ProtocolError
|
46
|
-
Rawler.output.error("Connection problems -
|
45
|
+
EOFError, Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError, Net::ProtocolError, SocketError
|
46
|
+
Rawler.output.error("Connection problems - #{link} - Called from: #{from_url}")
|
47
47
|
end
|
48
48
|
|
49
49
|
def same_domain?(link)
|
@@ -60,8 +60,13 @@ module Rawler
|
|
60
60
|
Rawler.output.error(message)
|
61
61
|
end
|
62
62
|
|
63
|
-
def record_response(code, link)
|
63
|
+
def record_response(code, link, from_url)
|
64
64
|
message = "#{code} - #{link}"
|
65
|
+
|
66
|
+
if code.to_i >= 300
|
67
|
+
message += " - Called from: #{from_url}"
|
68
|
+
end
|
69
|
+
|
65
70
|
code = code.to_i
|
66
71
|
case code / 100
|
67
72
|
when 1
|
data/lib/rawler/crawler.rb
CHANGED
@@ -4,6 +4,8 @@ module Rawler
|
|
4
4
|
|
5
5
|
attr_accessor :url, :links
|
6
6
|
|
7
|
+
SKIP_FORMATS = /^(javascript|mailto)/
|
8
|
+
|
7
9
|
def initialize(url)
|
8
10
|
@url = url.strip
|
9
11
|
end
|
@@ -59,7 +61,7 @@ module Rawler
|
|
59
61
|
if ['http', 'https'].include?(scheme)
|
60
62
|
true
|
61
63
|
else
|
62
|
-
write("Invalid url - #{url}")
|
64
|
+
write("Invalid url - #{url}") unless url =~ SKIP_FORMATS
|
63
65
|
false
|
64
66
|
end
|
65
67
|
|
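data/lib/rawler.rb
CHANGED
[hunk for this file is missing from this extract; the file list above records +1 -1]
data/spec/lib/rawler/crawler_spec.rb
CHANGED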
@@ -100,26 +100,48 @@ describe Rawler::Crawler do
|
|
100
100
|
end
|
101
101
|
|
102
102
|
context "invalid urls" do
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
103
|
+
|
104
|
+
context "javascript" do
|
105
|
+
let(:url) { 'http://example.com/path' }
|
106
|
+
let(:crawler) { Rawler::Crawler.new(url) }
|
107
|
+
let(:js_url) { "javascript:fn('nbjmup;jhfs.esf{fio/dpn');" }
|
108
|
+
let(:content) { "<a href=\"#{js_url}\">foo</a><a name=\"foo\">" }
|
107
109
|
|
108
|
-
|
109
|
-
|
110
|
-
|
110
|
+
before(:each) do
|
111
|
+
register(url, content)
|
112
|
+
end
|
111
113
|
|
112
|
-
|
113
|
-
|
114
|
+
it "should return empty links" do
|
115
|
+
crawler.links.should == []
|
116
|
+
end
|
117
|
+
|
118
|
+
it "should not report the error" do
|
119
|
+
crawler.should_not_receive(:write)
|
120
|
+
crawler.links
|
121
|
+
end
|
114
122
|
end
|
115
123
|
|
116
|
-
|
117
|
-
|
118
|
-
crawler.
|
124
|
+
context "mailto" do
|
125
|
+
let(:url) { 'http://example.com/path' }
|
126
|
+
let(:crawler) { Rawler::Crawler.new(url) }
|
127
|
+
let(:content) { "<a href=\"mailto:example@example.com\">foo</a><a name=\"foo\">" }
|
128
|
+
|
129
|
+
before(:each) do
|
130
|
+
register(url, content)
|
131
|
+
end
|
132
|
+
|
133
|
+
it "should return empty links" do
|
134
|
+
crawler.links.should == []
|
135
|
+
end
|
136
|
+
|
137
|
+
it "should not report the error" do
|
138
|
+
crawler.should_not_receive(:write)
|
139
|
+
crawler.links
|
140
|
+
end
|
119
141
|
end
|
142
|
+
|
120
143
|
end
|
121
|
-
|
122
|
-
|
144
|
+
|
123
145
|
context "content type" do
|
124
146
|
|
125
147
|
['text/plain', 'text/css', 'image/jpeg'].each do |content_type|
|
data/spec/lib/rawler_spec.rb
CHANGED
@@ -58,29 +58,31 @@ describe Rawler::Base do
|
|
58
58
|
output.should_receive(:info).with('200 - http://example.com/foo1')
|
59
59
|
output.should_receive(:info).with('200 - http://example.com/foo2')
|
60
60
|
output.should_receive(:info).with('200 - http://external.com')
|
61
|
-
output.should_receive(:warn).with('302 - http://external.com/foo')
|
61
|
+
output.should_receive(:warn).with('302 - http://external.com/foo - Called from: http://example.com/foo1')
|
62
62
|
|
63
63
|
rawler.validate
|
64
64
|
end
|
65
|
-
|
65
|
+
|
66
66
|
end
|
67
67
|
|
68
68
|
describe "get_status_code" do
|
69
69
|
|
70
70
|
it "should add to 200 links" do
|
71
71
|
url = 'http://example.com/foo'
|
72
|
+
from = 'http://other.com'
|
72
73
|
register(url, '', 200)
|
73
74
|
|
74
|
-
rawler.send(:add_status_code, url)
|
75
|
+
rawler.send(:add_status_code, url, from)
|
75
76
|
|
76
77
|
rawler.responses[url][:status].should == 200
|
77
78
|
end
|
78
79
|
|
79
80
|
it "should add to 302 links" do
|
80
81
|
url = 'http://example.com/foo'
|
82
|
+
from = 'http://other.com'
|
81
83
|
register(url, '', 302)
|
82
84
|
|
83
|
-
rawler.send(:add_status_code, url)
|
85
|
+
rawler.send(:add_status_code, url, from)
|
84
86
|
|
85
87
|
rawler.responses[url][:status].should == 302
|
86
88
|
end
|
@@ -94,24 +96,26 @@ describe Rawler::Base do
|
|
94
96
|
|
95
97
|
it "should rescue from Errno::ECONNREFUSED" do
|
96
98
|
url = 'http://example.com'
|
99
|
+
from = 'http://other.com'
|
97
100
|
|
98
101
|
Rawler::Request.should_receive(:get).and_raise Errno::ECONNREFUSED
|
99
102
|
|
100
|
-
output.should_receive(:error).with("Connection refused -
|
103
|
+
output.should_receive(:error).with("Connection refused - #{url} - Called from: #{from}")
|
101
104
|
|
102
|
-
rawler.send(:add_status_code, url)
|
105
|
+
rawler.send(:add_status_code, url, from)
|
103
106
|
end
|
104
107
|
|
105
108
|
[Timeout::Error, Errno::EINVAL, Errno::ECONNRESET, Errno::ETIMEDOUT, EOFError,
|
106
|
-
Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError, Net::ProtocolError].each do |error|
|
109
|
+
Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError, Net::ProtocolError, SocketError].each do |error|
|
107
110
|
it "should rescue from #{error}" do
|
108
111
|
url = 'http://example.com'
|
112
|
+
from = 'http://other.com'
|
109
113
|
|
110
114
|
Rawler::Request.should_receive(:get).and_raise error
|
111
115
|
|
112
|
-
output.should_receive(:error).with("Connection problems -
|
116
|
+
output.should_receive(:error).with("Connection problems - #{url} - Called from: #{from}")
|
113
117
|
|
114
|
-
rawler.send(:add_status_code, url)
|
118
|
+
rawler.send(:add_status_code, url, from)
|
115
119
|
end
|
116
120
|
end
|
117
121
|
|
@@ -119,14 +123,15 @@ describe Rawler::Base do
|
|
119
123
|
|
120
124
|
describe "record_response" do
|
121
125
|
|
122
|
-
let(:
|
126
|
+
let(:link) { 'http://foo.com' }
|
127
|
+
let(:from) { 'http://bar.com' }
|
123
128
|
|
124
129
|
context "response code 100" do
|
125
130
|
%w!100, 150, 199!.each do |code|
|
126
131
|
|
127
132
|
it "logger should receive info" do
|
128
|
-
output.should_receive(:info).with("#{code} - #{
|
129
|
-
rawler.send(:record_response, code,
|
133
|
+
output.should_receive(:info).with("#{code} - #{link}")
|
134
|
+
rawler.send(:record_response, code, link, from)
|
130
135
|
end
|
131
136
|
|
132
137
|
end
|
@@ -136,8 +141,8 @@ describe Rawler::Base do
|
|
136
141
|
%w!200, 250, 299!.each do |code|
|
137
142
|
|
138
143
|
it "logger should receive info" do
|
139
|
-
output.should_receive(:info).with("#{code} - #{
|
140
|
-
rawler.send(:record_response, code,
|
144
|
+
output.should_receive(:info).with("#{code} - #{link}")
|
145
|
+
rawler.send(:record_response, code, link, from)
|
141
146
|
end
|
142
147
|
|
143
148
|
end
|
@@ -147,8 +152,8 @@ describe Rawler::Base do
|
|
147
152
|
%w!300, 350, 399!.each do |code|
|
148
153
|
|
149
154
|
it "logger should receive warn" do
|
150
|
-
output.should_receive(:warn).with("#{code} - #{
|
151
|
-
rawler.send(:record_response, code,
|
155
|
+
output.should_receive(:warn).with("#{code} - #{link} - Called from: #{from}")
|
156
|
+
rawler.send(:record_response, code, link, from)
|
152
157
|
end
|
153
158
|
|
154
159
|
end
|
@@ -158,8 +163,8 @@ describe Rawler::Base do
|
|
158
163
|
%w!400, 450, 499!.each do |code|
|
159
164
|
|
160
165
|
it "logger should receive info" do
|
161
|
-
output.should_receive(:error).with("#{code} - #{
|
162
|
-
rawler.send(:record_response, code,
|
166
|
+
output.should_receive(:error).with("#{code} - #{link} - Called from: #{from}")
|
167
|
+
rawler.send(:record_response, code, link, from)
|
163
168
|
end
|
164
169
|
|
165
170
|
end
|
@@ -169,8 +174,8 @@ describe Rawler::Base do
|
|
169
174
|
%w!400, 550, 599!.each do |code|
|
170
175
|
|
171
176
|
it "logger should receive info" do
|
172
|
-
output.should_receive(:error).with("#{code} - #{
|
173
|
-
rawler.send(:record_response, code,
|
177
|
+
output.should_receive(:error).with("#{code} - #{link} - Called from: #{from}")
|
178
|
+
rawler.send(:record_response, code, link, from)
|
174
179
|
end
|
175
180
|
|
176
181
|
end
|
@@ -180,8 +185,8 @@ describe Rawler::Base do
|
|
180
185
|
let(:code) { 600 }
|
181
186
|
|
182
187
|
it "logger should receive eror" do
|
183
|
-
output.should_receive(:error).with("Unknown code #{code} - #{
|
184
|
-
rawler.send(:record_response, code,
|
188
|
+
output.should_receive(:error).with("Unknown code #{code} - #{link} - Called from: #{from}")
|
189
|
+
rawler.send(:record_response, code, link, from)
|
185
190
|
end
|
186
191
|
end
|
187
192
|
|
metadata
CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
|
|
5
5
|
segments:
|
6
6
|
- 0
|
7
7
|
- 0
|
8
|
-
- 7
|
9
|
-
version: 0.0.7
|
8
|
+
- 8
|
9
|
+
version: 0.0.8
|
10
10
|
platform: ruby
|
11
11
|
authors:
|
12
12
|
- Oscar Del Ben
|
@@ -14,7 +14,7 @@ autorequire:
|
|
14
14
|
bindir: bin
|
15
15
|
cert_chain: []
|
16
16
|
|
17
|
-
date: 2011-03-
|
17
|
+
date: 2011-03-17 00:00:00 +01:00
|
18
18
|
default_executable:
|
19
19
|
dependencies:
|
20
20
|
- !ruby/object:Gem::Dependency
|