rawler 0.0.7 → 0.0.8

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between package versions as they appear in their respective public registries.
data/lib/rawler/base.rb CHANGED
@@ -21,29 +21,29 @@ module Rawler
21
21
 
22
22
  def validate_links_in_page(current_url)
23
23
  Rawler::Crawler.new(current_url).links.each do |page_url|
24
- validate_page(page_url)
24
+ validate_page(page_url, current_url)
25
25
  # Todo: include this in a configuration option
26
26
  sleep(3)
27
27
  end
28
28
  end
29
29
 
30
- def validate_page(page_url)
30
+ def validate_page(page_url, from_url)
31
31
  if not_yet_parsed?(page_url)
32
- add_status_code(page_url)
32
+ add_status_code(page_url, from_url)
33
33
  validate_links_in_page(page_url) if same_domain?(page_url)
34
34
  end
35
35
  end
36
36
 
37
- def add_status_code(link)
37
+ def add_status_code(link, from_url)
38
38
  response = Rawler::Request.get(link)
39
39
 
40
- record_response(response.code, link)
40
+ record_response(response.code, link, from_url)
41
41
  responses[link] = { :status => response.code.to_i }
42
42
  rescue Errno::ECONNREFUSED
43
- Rawler.output.error("Connection refused - '#{link}'")
43
+ Rawler.output.error("Connection refused - #{link} - Called from: #{from_url}")
44
44
  rescue Timeout::Error, Errno::EINVAL, Errno::ECONNRESET, Errno::ETIMEDOUT,
45
- EOFError, Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError, Net::ProtocolError
46
- Rawler.output.error("Connection problems - '#{link}'")
45
+ EOFError, Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError, Net::ProtocolError, SocketError
46
+ Rawler.output.error("Connection problems - #{link} - Called from: #{from_url}")
47
47
  end
48
48
 
49
49
  def same_domain?(link)
@@ -60,8 +60,13 @@ module Rawler
60
60
  Rawler.output.error(message)
61
61
  end
62
62
 
63
- def record_response(code, link)
63
+ def record_response(code, link, from_url)
64
64
  message = "#{code} - #{link}"
65
+
66
+ if code.to_i >= 300
67
+ message += " - Called from: #{from_url}"
68
+ end
69
+
65
70
  code = code.to_i
66
71
  case code / 100
67
72
  when 1
@@ -4,6 +4,8 @@ module Rawler
4
4
 
5
5
  attr_accessor :url, :links
6
6
 
7
+ SKIP_FORMATS = /^(javascript|mailto)/
8
+
7
9
  def initialize(url)
8
10
  @url = url.strip
9
11
  end
@@ -59,7 +61,7 @@ module Rawler
59
61
  if ['http', 'https'].include?(scheme)
60
62
  true
61
63
  else
62
- write("Invalid url - #{url}")
64
+ write("Invalid url - #{url}") unless url =~ SKIP_FORMATS
63
65
  false
64
66
  end
65
67
 
data/lib/rawler.rb CHANGED
@@ -7,7 +7,7 @@ require 'logger'
7
7
  require 'rawler/core_extensions'
8
8
 
9
9
  module Rawler
10
- VERSION = '0.0.7'
10
+ VERSION = '0.0.8'
11
11
 
12
12
  mattr_accessor :output
13
13
  mattr_accessor :url
@@ -100,26 +100,48 @@ describe Rawler::Crawler do
100
100
  end
101
101
 
102
102
  context "invalid urls" do
103
- let(:url) { 'http://example.com/path' }
104
- let(:crawler) { Rawler::Crawler.new(url) }
105
- let(:js_url) { "javascript:fn('nbjmup;jhfs.esf{fio/dpn');" }
106
- let(:content) { "<a href=\"#{js_url}\">foo</a><a name=\"foo\">" }
103
+
104
+ context "javascript" do
105
+ let(:url) { 'http://example.com/path' }
106
+ let(:crawler) { Rawler::Crawler.new(url) }
107
+ let(:js_url) { "javascript:fn('nbjmup;jhfs.esf{fio/dpn');" }
108
+ let(:content) { "<a href=\"#{js_url}\">foo</a><a name=\"foo\">" }
107
109
 
108
- before(:each) do
109
- register(url, content)
110
- end
110
+ before(:each) do
111
+ register(url, content)
112
+ end
111
113
 
112
- it "should parse relative links" do
113
- crawler.links.should == []
114
+ it "should return empty links" do
115
+ crawler.links.should == []
116
+ end
117
+
118
+ it "should not report the error" do
119
+ crawler.should_not_receive(:write)
120
+ crawler.links
121
+ end
114
122
  end
115
123
 
116
- it "should report the error" do
117
- crawler.should_receive(:write).with("Invalid url - javascript:fn('nbjmup;jhfs.esf%7Bfio/dpn');")
118
- crawler.links
124
+ context "mailto" do
125
+ let(:url) { 'http://example.com/path' }
126
+ let(:crawler) { Rawler::Crawler.new(url) }
127
+ let(:content) { "<a href=\"mailto:example@example.com\">foo</a><a name=\"foo\">" }
128
+
129
+ before(:each) do
130
+ register(url, content)
131
+ end
132
+
133
+ it "should return empty links" do
134
+ crawler.links.should == []
135
+ end
136
+
137
+ it "should not report the error" do
138
+ crawler.should_not_receive(:write)
139
+ crawler.links
140
+ end
119
141
  end
142
+
120
143
  end
121
-
122
-
144
+
123
145
  context "content type" do
124
146
 
125
147
  ['text/plain', 'text/css', 'image/jpeg'].each do |content_type|
@@ -58,29 +58,31 @@ describe Rawler::Base do
58
58
  output.should_receive(:info).with('200 - http://example.com/foo1')
59
59
  output.should_receive(:info).with('200 - http://example.com/foo2')
60
60
  output.should_receive(:info).with('200 - http://external.com')
61
- output.should_receive(:warn).with('302 - http://external.com/foo')
61
+ output.should_receive(:warn).with('302 - http://external.com/foo - Called from: http://example.com/foo1')
62
62
 
63
63
  rawler.validate
64
64
  end
65
-
65
+
66
66
  end
67
67
 
68
68
  describe "get_status_code" do
69
69
 
70
70
  it "should add to 200 links" do
71
71
  url = 'http://example.com/foo'
72
+ from = 'http://other.com'
72
73
  register(url, '', 200)
73
74
 
74
- rawler.send(:add_status_code, url)
75
+ rawler.send(:add_status_code, url, from)
75
76
 
76
77
  rawler.responses[url][:status].should == 200
77
78
  end
78
79
 
79
80
  it "should add to 302 links" do
80
81
  url = 'http://example.com/foo'
82
+ from = 'http://other.com'
81
83
  register(url, '', 302)
82
84
 
83
- rawler.send(:add_status_code, url)
85
+ rawler.send(:add_status_code, url, from)
84
86
 
85
87
  rawler.responses[url][:status].should == 302
86
88
  end
@@ -94,24 +96,26 @@ describe Rawler::Base do
94
96
 
95
97
  it "should rescue from Errno::ECONNREFUSED" do
96
98
  url = 'http://example.com'
99
+ from = 'http://other.com'
97
100
 
98
101
  Rawler::Request.should_receive(:get).and_raise Errno::ECONNREFUSED
99
102
 
100
- output.should_receive(:error).with("Connection refused - '#{url}'")
103
+ output.should_receive(:error).with("Connection refused - #{url} - Called from: #{from}")
101
104
 
102
- rawler.send(:add_status_code, url)
105
+ rawler.send(:add_status_code, url, from)
103
106
  end
104
107
 
105
108
  [Timeout::Error, Errno::EINVAL, Errno::ECONNRESET, Errno::ETIMEDOUT, EOFError,
106
- Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError, Net::ProtocolError].each do |error|
109
+ Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError, Net::ProtocolError, SocketError].each do |error|
107
110
  it "should rescue from #{error}" do
108
111
  url = 'http://example.com'
112
+ from = 'http://other.com'
109
113
 
110
114
  Rawler::Request.should_receive(:get).and_raise error
111
115
 
112
- output.should_receive(:error).with("Connection problems - '#{url}'")
116
+ output.should_receive(:error).with("Connection problems - #{url} - Called from: #{from}")
113
117
 
114
- rawler.send(:add_status_code, url)
118
+ rawler.send(:add_status_code, url, from)
115
119
  end
116
120
  end
117
121
 
@@ -119,14 +123,15 @@ describe Rawler::Base do
119
123
 
120
124
  describe "record_response" do
121
125
 
122
- let(:message) { 'foo' }
126
+ let(:link) { 'http://foo.com' }
127
+ let(:from) { 'http://bar.com' }
123
128
 
124
129
  context "response code 100" do
125
130
  %w!100, 150, 199!.each do |code|
126
131
 
127
132
  it "logger should receive info" do
128
- output.should_receive(:info).with("#{code} - #{message}")
129
- rawler.send(:record_response, code, message)
133
+ output.should_receive(:info).with("#{code} - #{link}")
134
+ rawler.send(:record_response, code, link, from)
130
135
  end
131
136
 
132
137
  end
@@ -136,8 +141,8 @@ describe Rawler::Base do
136
141
  %w!200, 250, 299!.each do |code|
137
142
 
138
143
  it "logger should receive info" do
139
- output.should_receive(:info).with("#{code} - #{message}")
140
- rawler.send(:record_response, code, message)
144
+ output.should_receive(:info).with("#{code} - #{link}")
145
+ rawler.send(:record_response, code, link, from)
141
146
  end
142
147
 
143
148
  end
@@ -147,8 +152,8 @@ describe Rawler::Base do
147
152
  %w!300, 350, 399!.each do |code|
148
153
 
149
154
  it "logger should receive warn" do
150
- output.should_receive(:warn).with("#{code} - #{message}")
151
- rawler.send(:record_response, code, message)
155
+ output.should_receive(:warn).with("#{code} - #{link} - Called from: #{from}")
156
+ rawler.send(:record_response, code, link, from)
152
157
  end
153
158
 
154
159
  end
@@ -158,8 +163,8 @@ describe Rawler::Base do
158
163
  %w!400, 450, 499!.each do |code|
159
164
 
160
165
  it "logger should receive info" do
161
- output.should_receive(:error).with("#{code} - #{message}")
162
- rawler.send(:record_response, code, message)
166
+ output.should_receive(:error).with("#{code} - #{link} - Called from: #{from}")
167
+ rawler.send(:record_response, code, link, from)
163
168
  end
164
169
 
165
170
  end
@@ -169,8 +174,8 @@ describe Rawler::Base do
169
174
  %w!400, 550, 599!.each do |code|
170
175
 
171
176
  it "logger should receive info" do
172
- output.should_receive(:error).with("#{code} - #{message}")
173
- rawler.send(:record_response, code, message)
177
+ output.should_receive(:error).with("#{code} - #{link} - Called from: #{from}")
178
+ rawler.send(:record_response, code, link, from)
174
179
  end
175
180
 
176
181
  end
@@ -180,8 +185,8 @@ describe Rawler::Base do
180
185
  let(:code) { 600 }
181
186
 
182
187
  it "logger should receive eror" do
183
- output.should_receive(:error).with("Unknown code #{code} - #{message}")
184
- rawler.send(:record_response, code, message)
188
+ output.should_receive(:error).with("Unknown code #{code} - #{link} - Called from: #{from}")
189
+ rawler.send(:record_response, code, link, from)
185
190
  end
186
191
  end
187
192
 
metadata CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
5
5
  segments:
6
6
  - 0
7
7
  - 0
8
- - 7
9
- version: 0.0.7
8
+ - 8
9
+ version: 0.0.8
10
10
  platform: ruby
11
11
  authors:
12
12
  - Oscar Del Ben
@@ -14,7 +14,7 @@ autorequire:
14
14
  bindir: bin
15
15
  cert_chain: []
16
16
 
17
- date: 2011-03-07 00:00:00 +01:00
17
+ date: 2011-03-17 00:00:00 +01:00
18
18
  default_executable:
19
19
  dependencies:
20
20
  - !ruby/object:Gem::Dependency