rawler 0.0.7 → 0.0.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/rawler/base.rb +14 -9
- data/lib/rawler/crawler.rb +3 -1
- data/lib/rawler.rb +1 -1
- data/spec/lib/rawler/crawler_spec.rb +36 -14
- data/spec/lib/rawler_spec.rb +27 -22
- metadata +3 -3
data/lib/rawler/base.rb
CHANGED
@@ -21,29 +21,29 @@ module Rawler
|
|
21
21
|
|
22
22
|
def validate_links_in_page(current_url)
|
23
23
|
Rawler::Crawler.new(current_url).links.each do |page_url|
|
24
|
-
validate_page(page_url)
|
24
|
+
validate_page(page_url, current_url)
|
25
25
|
# Todo: include this in a configuration option
|
26
26
|
sleep(3)
|
27
27
|
end
|
28
28
|
end
|
29
29
|
|
30
|
-
def validate_page(page_url)
|
30
|
+
def validate_page(page_url, from_url)
|
31
31
|
if not_yet_parsed?(page_url)
|
32
|
-
add_status_code(page_url)
|
32
|
+
add_status_code(page_url, from_url)
|
33
33
|
validate_links_in_page(page_url) if same_domain?(page_url)
|
34
34
|
end
|
35
35
|
end
|
36
36
|
|
37
|
-
def add_status_code(link)
|
37
|
+
def add_status_code(link, from_url)
|
38
38
|
response = Rawler::Request.get(link)
|
39
39
|
|
40
|
-
record_response(response.code, link)
|
40
|
+
record_response(response.code, link, from_url)
|
41
41
|
responses[link] = { :status => response.code.to_i }
|
42
42
|
rescue Errno::ECONNREFUSED
|
43
|
-
Rawler.output.error("Connection refused -
|
43
|
+
Rawler.output.error("Connection refused - #{link} - Called from: #{from_url}")
|
44
44
|
rescue Timeout::Error, Errno::EINVAL, Errno::ECONNRESET, Errno::ETIMEDOUT,
|
45
|
-
EOFError, Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError, Net::ProtocolError
|
46
|
-
Rawler.output.error("Connection problems -
|
45
|
+
EOFError, Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError, Net::ProtocolError, SocketError
|
46
|
+
Rawler.output.error("Connection problems - #{link} - Called from: #{from_url}")
|
47
47
|
end
|
48
48
|
|
49
49
|
def same_domain?(link)
|
@@ -60,8 +60,13 @@ module Rawler
|
|
60
60
|
Rawler.output.error(message)
|
61
61
|
end
|
62
62
|
|
63
|
-
def record_response(code, link)
|
63
|
+
def record_response(code, link, from_url)
|
64
64
|
message = "#{code} - #{link}"
|
65
|
+
|
66
|
+
if code.to_i >= 300
|
67
|
+
message += " - Called from: #{from_url}"
|
68
|
+
end
|
69
|
+
|
65
70
|
code = code.to_i
|
66
71
|
case code / 100
|
67
72
|
when 1
|
data/lib/rawler/crawler.rb
CHANGED
@@ -4,6 +4,8 @@ module Rawler
|
|
4
4
|
|
5
5
|
attr_accessor :url, :links
|
6
6
|
|
7
|
+
SKIP_FORMATS = /^(javascript|mailto)/
|
8
|
+
|
7
9
|
def initialize(url)
|
8
10
|
@url = url.strip
|
9
11
|
end
|
@@ -59,7 +61,7 @@ module Rawler
|
|
59
61
|
if ['http', 'https'].include?(scheme)
|
60
62
|
true
|
61
63
|
else
|
62
|
-
write("Invalid url - #{url}")
|
64
|
+
write("Invalid url - #{url}") unless url =~ SKIP_FORMATS
|
63
65
|
false
|
64
66
|
end
|
65
67
|
|
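data/lib/rawler.rb
CHANGED
[The hunk for this file was not captured in this extract; the file list above records +1 -1 for it, presumably the version bump from 0.0.7 to 0.0.8.]
data/spec/lib/rawler/crawler_spec.rb
CHANGED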
@@ -100,26 +100,48 @@ describe Rawler::Crawler do
|
|
100
100
|
end
|
101
101
|
|
102
102
|
context "invalid urls" do
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
103
|
+
|
104
|
+
context "javascript" do
|
105
|
+
let(:url) { 'http://example.com/path' }
|
106
|
+
let(:crawler) { Rawler::Crawler.new(url) }
|
107
|
+
let(:js_url) { "javascript:fn('nbjmup;jhfs.esf{fio/dpn');" }
|
108
|
+
let(:content) { "<a href=\"#{js_url}\">foo</a><a name=\"foo\">" }
|
107
109
|
|
108
|
-
|
109
|
-
|
110
|
-
|
110
|
+
before(:each) do
|
111
|
+
register(url, content)
|
112
|
+
end
|
111
113
|
|
112
|
-
|
113
|
-
|
114
|
+
it "should return empty links" do
|
115
|
+
crawler.links.should == []
|
116
|
+
end
|
117
|
+
|
118
|
+
it "should not report the error" do
|
119
|
+
crawler.should_not_receive(:write)
|
120
|
+
crawler.links
|
121
|
+
end
|
114
122
|
end
|
115
123
|
|
116
|
-
|
117
|
-
|
118
|
-
crawler.
|
124
|
+
context "mailto" do
|
125
|
+
let(:url) { 'http://example.com/path' }
|
126
|
+
let(:crawler) { Rawler::Crawler.new(url) }
|
127
|
+
let(:content) { "<a href=\"mailto:example@example.com\">foo</a><a name=\"foo\">" }
|
128
|
+
|
129
|
+
before(:each) do
|
130
|
+
register(url, content)
|
131
|
+
end
|
132
|
+
|
133
|
+
it "should return empty links" do
|
134
|
+
crawler.links.should == []
|
135
|
+
end
|
136
|
+
|
137
|
+
it "should not report the error" do
|
138
|
+
crawler.should_not_receive(:write)
|
139
|
+
crawler.links
|
140
|
+
end
|
119
141
|
end
|
142
|
+
|
120
143
|
end
|
121
|
-
|
122
|
-
|
144
|
+
|
123
145
|
context "content type" do
|
124
146
|
|
125
147
|
['text/plain', 'text/css', 'image/jpeg'].each do |content_type|
|
data/spec/lib/rawler_spec.rb
CHANGED
@@ -58,29 +58,31 @@ describe Rawler::Base do
|
|
58
58
|
output.should_receive(:info).with('200 - http://example.com/foo1')
|
59
59
|
output.should_receive(:info).with('200 - http://example.com/foo2')
|
60
60
|
output.should_receive(:info).with('200 - http://external.com')
|
61
|
-
output.should_receive(:warn).with('302 - http://external.com/foo')
|
61
|
+
output.should_receive(:warn).with('302 - http://external.com/foo - Called from: http://example.com/foo1')
|
62
62
|
|
63
63
|
rawler.validate
|
64
64
|
end
|
65
|
-
|
65
|
+
|
66
66
|
end
|
67
67
|
|
68
68
|
describe "get_status_code" do
|
69
69
|
|
70
70
|
it "should add to 200 links" do
|
71
71
|
url = 'http://example.com/foo'
|
72
|
+
from = 'http://other.com'
|
72
73
|
register(url, '', 200)
|
73
74
|
|
74
|
-
rawler.send(:add_status_code, url)
|
75
|
+
rawler.send(:add_status_code, url, from)
|
75
76
|
|
76
77
|
rawler.responses[url][:status].should == 200
|
77
78
|
end
|
78
79
|
|
79
80
|
it "should add to 302 links" do
|
80
81
|
url = 'http://example.com/foo'
|
82
|
+
from = 'http://other.com'
|
81
83
|
register(url, '', 302)
|
82
84
|
|
83
|
-
rawler.send(:add_status_code, url)
|
85
|
+
rawler.send(:add_status_code, url, from)
|
84
86
|
|
85
87
|
rawler.responses[url][:status].should == 302
|
86
88
|
end
|
@@ -94,24 +96,26 @@ describe Rawler::Base do
|
|
94
96
|
|
95
97
|
it "should rescue from Errno::ECONNREFUSED" do
|
96
98
|
url = 'http://example.com'
|
99
|
+
from = 'http://other.com'
|
97
100
|
|
98
101
|
Rawler::Request.should_receive(:get).and_raise Errno::ECONNREFUSED
|
99
102
|
|
100
|
-
output.should_receive(:error).with("Connection refused -
|
103
|
+
output.should_receive(:error).with("Connection refused - #{url} - Called from: #{from}")
|
101
104
|
|
102
|
-
rawler.send(:add_status_code, url)
|
105
|
+
rawler.send(:add_status_code, url, from)
|
103
106
|
end
|
104
107
|
|
105
108
|
[Timeout::Error, Errno::EINVAL, Errno::ECONNRESET, Errno::ETIMEDOUT, EOFError,
|
106
|
-
Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError, Net::ProtocolError].each do |error|
|
109
|
+
Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError, Net::ProtocolError, SocketError].each do |error|
|
107
110
|
it "should rescue from #{error}" do
|
108
111
|
url = 'http://example.com'
|
112
|
+
from = 'http://other.com'
|
109
113
|
|
110
114
|
Rawler::Request.should_receive(:get).and_raise error
|
111
115
|
|
112
|
-
output.should_receive(:error).with("Connection problems -
|
116
|
+
output.should_receive(:error).with("Connection problems - #{url} - Called from: #{from}")
|
113
117
|
|
114
|
-
rawler.send(:add_status_code, url)
|
118
|
+
rawler.send(:add_status_code, url, from)
|
115
119
|
end
|
116
120
|
end
|
117
121
|
|
@@ -119,14 +123,15 @@ describe Rawler::Base do
|
|
119
123
|
|
120
124
|
describe "record_response" do
|
121
125
|
|
122
|
-
let(:
|
126
|
+
let(:link) { 'http://foo.com' }
|
127
|
+
let(:from) { 'http://bar.com' }
|
123
128
|
|
124
129
|
context "response code 100" do
|
125
130
|
%w!100, 150, 199!.each do |code|
|
126
131
|
|
127
132
|
it "logger should receive info" do
|
128
|
-
output.should_receive(:info).with("#{code} - #{
|
129
|
-
rawler.send(:record_response, code,
|
133
|
+
output.should_receive(:info).with("#{code} - #{link}")
|
134
|
+
rawler.send(:record_response, code, link, from)
|
130
135
|
end
|
131
136
|
|
132
137
|
end
|
@@ -136,8 +141,8 @@ describe Rawler::Base do
|
|
136
141
|
%w!200, 250, 299!.each do |code|
|
137
142
|
|
138
143
|
it "logger should receive info" do
|
139
|
-
output.should_receive(:info).with("#{code} - #{
|
140
|
-
rawler.send(:record_response, code,
|
144
|
+
output.should_receive(:info).with("#{code} - #{link}")
|
145
|
+
rawler.send(:record_response, code, link, from)
|
141
146
|
end
|
142
147
|
|
143
148
|
end
|
@@ -147,8 +152,8 @@ describe Rawler::Base do
|
|
147
152
|
%w!300, 350, 399!.each do |code|
|
148
153
|
|
149
154
|
it "logger should receive warn" do
|
150
|
-
output.should_receive(:warn).with("#{code} - #{
|
151
|
-
rawler.send(:record_response, code,
|
155
|
+
output.should_receive(:warn).with("#{code} - #{link} - Called from: #{from}")
|
156
|
+
rawler.send(:record_response, code, link, from)
|
152
157
|
end
|
153
158
|
|
154
159
|
end
|
@@ -158,8 +163,8 @@ describe Rawler::Base do
|
|
158
163
|
%w!400, 450, 499!.each do |code|
|
159
164
|
|
160
165
|
it "logger should receive info" do
|
161
|
-
output.should_receive(:error).with("#{code} - #{
|
162
|
-
rawler.send(:record_response, code,
|
166
|
+
output.should_receive(:error).with("#{code} - #{link} - Called from: #{from}")
|
167
|
+
rawler.send(:record_response, code, link, from)
|
163
168
|
end
|
164
169
|
|
165
170
|
end
|
@@ -169,8 +174,8 @@ describe Rawler::Base do
|
|
169
174
|
%w!400, 550, 599!.each do |code|
|
170
175
|
|
171
176
|
it "logger should receive info" do
|
172
|
-
output.should_receive(:error).with("#{code} - #{
|
173
|
-
rawler.send(:record_response, code,
|
177
|
+
output.should_receive(:error).with("#{code} - #{link} - Called from: #{from}")
|
178
|
+
rawler.send(:record_response, code, link, from)
|
174
179
|
end
|
175
180
|
|
176
181
|
end
|
@@ -180,8 +185,8 @@ describe Rawler::Base do
|
|
180
185
|
let(:code) { 600 }
|
181
186
|
|
182
187
|
it "logger should receive eror" do
|
183
|
-
output.should_receive(:error).with("Unknown code #{code} - #{
|
184
|
-
rawler.send(:record_response, code,
|
188
|
+
output.should_receive(:error).with("Unknown code #{code} - #{link} - Called from: #{from}")
|
189
|
+
rawler.send(:record_response, code, link, from)
|
185
190
|
end
|
186
191
|
end
|
187
192
|
|
metadata
CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
|
|
5
5
|
segments:
|
6
6
|
- 0
|
7
7
|
- 0
|
8
|
-
- 7
|
9
|
-
version: 0.0.7
|
8
|
+
- 8
|
9
|
+
version: 0.0.8
|
10
10
|
platform: ruby
|
11
11
|
authors:
|
12
12
|
- Oscar Del Ben
|
@@ -14,7 +14,7 @@ autorequire:
|
|
14
14
|
bindir: bin
|
15
15
|
cert_chain: []
|
16
16
|
|
17
|
-
date: 2011-03-
|
17
|
+
date: 2011-03-17 00:00:00 +01:00
|
18
18
|
default_executable:
|
19
19
|
dependencies:
|
20
20
|
- !ruby/object:Gem::Dependency
|