rawler 0.0.7 → 0.0.8

Sign up to get free protection for your applications and to get access to all the features.
data/lib/rawler/base.rb CHANGED
@@ -21,29 +21,29 @@ module Rawler
21
21
 
22
22
  def validate_links_in_page(current_url)
23
23
  Rawler::Crawler.new(current_url).links.each do |page_url|
24
- validate_page(page_url)
24
+ validate_page(page_url, current_url)
25
25
  # TODO: make the delay between requests a configuration option
26
26
  sleep(3)
27
27
  end
28
28
  end
29
29
 
30
- def validate_page(page_url)
30
+ def validate_page(page_url, from_url)
31
31
  if not_yet_parsed?(page_url)
32
- add_status_code(page_url)
32
+ add_status_code(page_url, from_url)
33
33
  validate_links_in_page(page_url) if same_domain?(page_url)
34
34
  end
35
35
  end
36
36
 
37
- def add_status_code(link)
37
+ def add_status_code(link, from_url)
38
38
  response = Rawler::Request.get(link)
39
39
 
40
- record_response(response.code, link)
40
+ record_response(response.code, link, from_url)
41
41
  responses[link] = { :status => response.code.to_i }
42
42
  rescue Errno::ECONNREFUSED
43
- Rawler.output.error("Connection refused - '#{link}'")
43
+ Rawler.output.error("Connection refused - #{link} - Called from: #{from_url}")
44
44
  rescue Timeout::Error, Errno::EINVAL, Errno::ECONNRESET, Errno::ETIMEDOUT,
45
- EOFError, Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError, Net::ProtocolError
46
- Rawler.output.error("Connection problems - '#{link}'")
45
+ EOFError, Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError, Net::ProtocolError, SocketError
46
+ Rawler.output.error("Connection problems - #{link} - Called from: #{from_url}")
47
47
  end
48
48
 
49
49
  def same_domain?(link)
@@ -60,8 +60,13 @@ module Rawler
60
60
  Rawler.output.error(message)
61
61
  end
62
62
 
63
- def record_response(code, link)
63
+ def record_response(code, link, from_url)
64
64
  message = "#{code} - #{link}"
65
+
66
+ if code.to_i >= 300
67
+ message += " - Called from: #{from_url}"
68
+ end
69
+
65
70
  code = code.to_i
66
71
  case code / 100
67
72
  when 1
@@ -4,6 +4,8 @@ module Rawler
4
4
 
5
5
  attr_accessor :url, :links
6
6
 
7
+ SKIP_FORMATS = /^(javascript|mailto)/
8
+
7
9
  def initialize(url)
8
10
  @url = url.strip
9
11
  end
@@ -59,7 +61,7 @@ module Rawler
59
61
  if ['http', 'https'].include?(scheme)
60
62
  true
61
63
  else
62
- write("Invalid url - #{url}")
64
+ write("Invalid url - #{url}") unless url =~ SKIP_FORMATS
63
65
  false
64
66
  end
65
67
 
data/lib/rawler.rb CHANGED
@@ -7,7 +7,7 @@ require 'logger'
7
7
  require 'rawler/core_extensions'
8
8
 
9
9
  module Rawler
10
- VERSION = '0.0.7'
10
+ VERSION = '0.0.8'
11
11
 
12
12
  mattr_accessor :output
13
13
  mattr_accessor :url
@@ -100,26 +100,48 @@ describe Rawler::Crawler do
100
100
  end
101
101
 
102
102
  context "invalid urls" do
103
- let(:url) { 'http://example.com/path' }
104
- let(:crawler) { Rawler::Crawler.new(url) }
105
- let(:js_url) { "javascript:fn('nbjmup;jhfs.esf{fio/dpn');" }
106
- let(:content) { "<a href=\"#{js_url}\">foo</a><a name=\"foo\">" }
103
+
104
+ context "javascript" do
105
+ let(:url) { 'http://example.com/path' }
106
+ let(:crawler) { Rawler::Crawler.new(url) }
107
+ let(:js_url) { "javascript:fn('nbjmup;jhfs.esf{fio/dpn');" }
108
+ let(:content) { "<a href=\"#{js_url}\">foo</a><a name=\"foo\">" }
107
109
 
108
- before(:each) do
109
- register(url, content)
110
- end
110
+ before(:each) do
111
+ register(url, content)
112
+ end
111
113
 
112
- it "should parse relative links" do
113
- crawler.links.should == []
114
+ it "should return empty links" do
115
+ crawler.links.should == []
116
+ end
117
+
118
+ it "should not report the error" do
119
+ crawler.should_not_receive(:write)
120
+ crawler.links
121
+ end
114
122
  end
115
123
 
116
- it "should report the error" do
117
- crawler.should_receive(:write).with("Invalid url - javascript:fn('nbjmup;jhfs.esf%7Bfio/dpn');")
118
- crawler.links
124
+ context "mailto" do
125
+ let(:url) { 'http://example.com/path' }
126
+ let(:crawler) { Rawler::Crawler.new(url) }
127
+ let(:content) { "<a href=\"mailto:example@example.com\">foo</a><a name=\"foo\">" }
128
+
129
+ before(:each) do
130
+ register(url, content)
131
+ end
132
+
133
+ it "should return empty links" do
134
+ crawler.links.should == []
135
+ end
136
+
137
+ it "should not report the error" do
138
+ crawler.should_not_receive(:write)
139
+ crawler.links
140
+ end
119
141
  end
142
+
120
143
  end
121
-
122
-
144
+
123
145
  context "content type" do
124
146
 
125
147
  ['text/plain', 'text/css', 'image/jpeg'].each do |content_type|
@@ -58,29 +58,31 @@ describe Rawler::Base do
58
58
  output.should_receive(:info).with('200 - http://example.com/foo1')
59
59
  output.should_receive(:info).with('200 - http://example.com/foo2')
60
60
  output.should_receive(:info).with('200 - http://external.com')
61
- output.should_receive(:warn).with('302 - http://external.com/foo')
61
+ output.should_receive(:warn).with('302 - http://external.com/foo - Called from: http://example.com/foo1')
62
62
 
63
63
  rawler.validate
64
64
  end
65
-
65
+
66
66
  end
67
67
 
68
68
  describe "get_status_code" do
69
69
 
70
70
  it "should add to 200 links" do
71
71
  url = 'http://example.com/foo'
72
+ from = 'http://other.com'
72
73
  register(url, '', 200)
73
74
 
74
- rawler.send(:add_status_code, url)
75
+ rawler.send(:add_status_code, url, from)
75
76
 
76
77
  rawler.responses[url][:status].should == 200
77
78
  end
78
79
 
79
80
  it "should add to 302 links" do
80
81
  url = 'http://example.com/foo'
82
+ from = 'http://other.com'
81
83
  register(url, '', 302)
82
84
 
83
- rawler.send(:add_status_code, url)
85
+ rawler.send(:add_status_code, url, from)
84
86
 
85
87
  rawler.responses[url][:status].should == 302
86
88
  end
@@ -94,24 +96,26 @@ describe Rawler::Base do
94
96
 
95
97
  it "should rescue from Errno::ECONNREFUSED" do
96
98
  url = 'http://example.com'
99
+ from = 'http://other.com'
97
100
 
98
101
  Rawler::Request.should_receive(:get).and_raise Errno::ECONNREFUSED
99
102
 
100
- output.should_receive(:error).with("Connection refused - '#{url}'")
103
+ output.should_receive(:error).with("Connection refused - #{url} - Called from: #{from}")
101
104
 
102
- rawler.send(:add_status_code, url)
105
+ rawler.send(:add_status_code, url, from)
103
106
  end
104
107
 
105
108
  [Timeout::Error, Errno::EINVAL, Errno::ECONNRESET, Errno::ETIMEDOUT, EOFError,
106
- Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError, Net::ProtocolError].each do |error|
109
+ Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError, Net::ProtocolError, SocketError].each do |error|
107
110
  it "should rescue from #{error}" do
108
111
  url = 'http://example.com'
112
+ from = 'http://other.com'
109
113
 
110
114
  Rawler::Request.should_receive(:get).and_raise error
111
115
 
112
- output.should_receive(:error).with("Connection problems - '#{url}'")
116
+ output.should_receive(:error).with("Connection problems - #{url} - Called from: #{from}")
113
117
 
114
- rawler.send(:add_status_code, url)
118
+ rawler.send(:add_status_code, url, from)
115
119
  end
116
120
  end
117
121
 
@@ -119,14 +123,15 @@ describe Rawler::Base do
119
123
 
120
124
  describe "record_response" do
121
125
 
122
- let(:message) { 'foo' }
126
+ let(:link) { 'http://foo.com' }
127
+ let(:from) { 'http://bar.com' }
123
128
 
124
129
  context "response code 100" do
125
130
  %w!100, 150, 199!.each do |code|
126
131
 
127
132
  it "logger should receive info" do
128
- output.should_receive(:info).with("#{code} - #{message}")
129
- rawler.send(:record_response, code, message)
133
+ output.should_receive(:info).with("#{code} - #{link}")
134
+ rawler.send(:record_response, code, link, from)
130
135
  end
131
136
 
132
137
  end
@@ -136,8 +141,8 @@ describe Rawler::Base do
136
141
  %w!200, 250, 299!.each do |code|
137
142
 
138
143
  it "logger should receive info" do
139
- output.should_receive(:info).with("#{code} - #{message}")
140
- rawler.send(:record_response, code, message)
144
+ output.should_receive(:info).with("#{code} - #{link}")
145
+ rawler.send(:record_response, code, link, from)
141
146
  end
142
147
 
143
148
  end
@@ -147,8 +152,8 @@ describe Rawler::Base do
147
152
  %w!300, 350, 399!.each do |code|
148
153
 
149
154
  it "logger should receive warn" do
150
- output.should_receive(:warn).with("#{code} - #{message}")
151
- rawler.send(:record_response, code, message)
155
+ output.should_receive(:warn).with("#{code} - #{link} - Called from: #{from}")
156
+ rawler.send(:record_response, code, link, from)
152
157
  end
153
158
 
154
159
  end
@@ -158,8 +163,8 @@ describe Rawler::Base do
158
163
  %w!400, 450, 499!.each do |code|
159
164
 
160
165
  it "logger should receive info" do
161
- output.should_receive(:error).with("#{code} - #{message}")
162
- rawler.send(:record_response, code, message)
166
+ output.should_receive(:error).with("#{code} - #{link} - Called from: #{from}")
167
+ rawler.send(:record_response, code, link, from)
163
168
  end
164
169
 
165
170
  end
@@ -169,8 +174,8 @@ describe Rawler::Base do
169
174
  %w!400, 550, 599!.each do |code|
170
175
 
171
176
  it "logger should receive info" do
172
- output.should_receive(:error).with("#{code} - #{message}")
173
- rawler.send(:record_response, code, message)
177
+ output.should_receive(:error).with("#{code} - #{link} - Called from: #{from}")
178
+ rawler.send(:record_response, code, link, from)
174
179
  end
175
180
 
176
181
  end
@@ -180,8 +185,8 @@ describe Rawler::Base do
180
185
  let(:code) { 600 }
181
186
 
182
187
  it "logger should receive eror" do
183
- output.should_receive(:error).with("Unknown code #{code} - #{message}")
184
- rawler.send(:record_response, code, message)
188
+ output.should_receive(:error).with("Unknown code #{code} - #{link} - Called from: #{from}")
189
+ rawler.send(:record_response, code, link, from)
185
190
  end
186
191
  end
187
192
 
metadata CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
5
5
  segments:
6
6
  - 0
7
7
  - 0
8
- - 7
9
- version: 0.0.7
8
+ - 8
9
+ version: 0.0.8
10
10
  platform: ruby
11
11
  authors:
12
12
  - Oscar Del Ben
@@ -14,7 +14,7 @@ autorequire:
14
14
  bindir: bin
15
15
  cert_chain: []
16
16
 
17
- date: 2011-03-07 00:00:00 +01:00
17
+ date: 2011-03-17 00:00:00 +01:00
18
18
  default_executable:
19
19
  dependencies:
20
20
  - !ruby/object:Gem::Dependency