rawler 0.0.8 → 0.0.9

Sign up to get free protection for your applications and to get access to all the features.
data/README.txt CHANGED
@@ -13,8 +13,8 @@ Rawler will only parse pages with content type 'text/html', but it will check fo
13
13
  rawler http://example.com [options]
14
14
 
15
15
  where [options] are:
16
- --username, -u <s>: HTT Basic Username
17
- --password, -p <s>: HTT Basic Password
16
+ --username, -u <s>: HTTP Basic Username
17
+ --password, -p <s>: HTTP Basic Password
18
18
  --version, -v: Print version and exit
19
19
  --help, -h: Show this message
20
20
 
@@ -40,13 +40,6 @@ If you add files, run:
40
40
 
41
41
  And add them to the Manifest file.
42
42
 
43
- == TODO
44
-
45
- * Add logger levels
46
- * Follow redirects, but still inform about them
47
- * Respect robots.txt
48
- * Export to html
49
-
50
43
  == CONTRIBUTORS:
51
44
 
52
45
  * bcoob
@@ -7,7 +7,7 @@ require 'logger'
7
7
  require 'rawler/core_extensions'
8
8
 
9
9
  module Rawler
10
- VERSION = '0.0.8'
10
+ VERSION = '0.0.9'
11
11
 
12
12
  mattr_accessor :output
13
13
  mattr_accessor :url
@@ -36,14 +36,16 @@ module Rawler
36
36
 
37
37
  def add_status_code(link, from_url)
38
38
  response = Rawler::Request.get(link)
39
+
40
+ validate_page(response['Location'], from_url) if response['Location']
39
41
 
40
- record_response(response.code, link, from_url)
42
+ record_response(response.code, link, from_url, response['Location'])
41
43
  responses[link] = { :status => response.code.to_i }
42
44
  rescue Errno::ECONNREFUSED
43
- Rawler.output.error("Connection refused - #{link} - Called from: #{from_url}")
45
+ error("Connection refused - #{link} - Called from: #{from_url}")
44
46
  rescue Timeout::Error, Errno::EINVAL, Errno::ECONNRESET, Errno::ETIMEDOUT,
45
47
  EOFError, Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError, Net::ProtocolError, SocketError
46
- Rawler.output.error("Connection problems - #{link} - Called from: #{from_url}")
48
+ error("Connection problems - #{link} - Called from: #{from_url}")
47
49
  end
48
50
 
49
51
  def same_domain?(link)
@@ -54,18 +56,18 @@ module Rawler
54
56
  responses[link].nil?
55
57
  end
56
58
 
57
- def write(message)
58
- # TODO: This may not always be an error message,
59
- # but that will make it show up most of the time
59
+ def error(message)
60
60
  Rawler.output.error(message)
61
61
  end
62
62
 
63
- def record_response(code, link, from_url)
63
+ def record_response(code, link, from_url, redirection=nil)
64
64
  message = "#{code} - #{link}"
65
65
 
66
66
  if code.to_i >= 300
67
67
  message += " - Called from: #{from_url}"
68
68
  end
69
+
70
+ message += " - Following redirection to: #{redirection}" if redirection
69
71
 
70
72
  code = code.to_i
71
73
  case code / 100
@@ -19,10 +19,10 @@ module Rawler
19
19
 
20
20
  doc = Nokogiri::HTML(response.body)
21
21
  doc.css('a').map { |a| a['href'] }.select { |url| !url.nil? }.map { |url| absolute_url(url) }.select { |url| valid_url?(url) }
22
- rescue Errno::ECONNREFUSED
22
+ rescue Errno::ECONNREFUSED # TODO: add called from
23
23
  write("Couldn't connect to #{url}")
24
24
  []
25
- rescue Errno::ETIMEDOUT
25
+ rescue Errno::ETIMEDOUT # TODO: add called from
26
26
  write("Connection to #{url} timed out")
27
27
  []
28
28
  end
@@ -38,8 +38,12 @@ module Rawler
38
38
  else
39
39
  path
40
40
  end
41
+ rescue URI::InvalidURIError
42
+ write("Invalid url: #{path} - Called from: #{url}")
43
+ nil
41
44
  end
42
45
 
46
+ # TODO: add 'called from' in a more pragmatic way as an optional parameter
43
47
  def write(message)
44
48
  Rawler.output.error(message)
45
49
  end
@@ -5,7 +5,7 @@ require File.dirname(__FILE__) + '/../../spec_helper.rb'
5
5
  describe Rawler::Crawler do
6
6
 
7
7
  let(:url) { 'http://example.com' }
8
- let(:output) { double("output", :error => nil) }
8
+ let(:output) { double('output', :error => nil) }
9
9
 
10
10
  before(:each) do
11
11
  Rawler.stub!(:url).and_return(url)
@@ -164,11 +164,9 @@ describe Rawler::Crawler do
164
164
 
165
165
  let(:url) { 'http://example.com' }
166
166
  let(:crawler) { Rawler::Crawler.new(url) }
167
- let(:output) { double('output', :error => nil) }
168
167
 
169
168
  before(:each) do
170
169
  register(url, '')
171
- Rawler.stub!(:output).and_return(output)
172
170
  end
173
171
 
174
172
  context "Errno::ECONNREFUSED" do
@@ -250,5 +248,20 @@ describe Rawler::Crawler do
250
248
  crawler.links.should == ['http://example.com/valid', 'https://foo.com', 'http://fooo.com']
251
249
  end
252
250
  end
251
+
252
+ context "invalid urls" do
253
+ let(:content) { '<a href="http://foo;bar">foo</a>' }
254
+ let(:url) { 'http://example.com' }
255
+ let(:crawler) { Rawler::Crawler.new(url) }
256
+
257
+ before(:each) do
258
+ register(url, content)
259
+ end
260
+
261
+ it "should notify about the invalid url" do
262
+ output.should_receive(:error).with('Invalid url: http://foo;bar - Called from: http://example.com')
263
+ crawler.links.should == []
264
+ end
265
+ end
253
266
 
254
267
  end
@@ -53,16 +53,27 @@ describe Rawler::Base do
53
53
  register('http://example.com/foo1', '<a href="http://external.com/foo">x</a>')
54
54
  register('http://example.com/foo2', '')
55
55
  register('http://external.com', '')
56
- register('http://external.com/foo', '', 302)
56
+ register('http://external.com/foo', '', 301)
57
57
 
58
58
  output.should_receive(:info).with('200 - http://example.com/foo1')
59
59
  output.should_receive(:info).with('200 - http://example.com/foo2')
60
60
  output.should_receive(:info).with('200 - http://external.com')
61
- output.should_receive(:warn).with('302 - http://external.com/foo - Called from: http://example.com/foo1')
61
+ output.should_receive(:warn).with('301 - http://external.com/foo - Called from: http://example.com/foo1')
62
62
 
63
63
  rawler.validate
64
64
  end
65
65
 
66
+ it "should follow redirections but inform about them" do
67
+ register('http://example.com', '<a href="/foo">foo</a>')
68
+ register('http://example.com/foo', '', 301, :location => 'http://example.com/bar')
69
+ register('http://example.com/bar', '')
70
+
71
+ output.should_receive(:warn).with('301 - http://example.com/foo - Called from: http://example.com - Following redirection to: http://example.com/bar')
72
+ output.should_receive(:info).with('200 - http://example.com/bar')
73
+
74
+ rawler.validate
75
+ end
76
+
66
77
  end
67
78
 
68
79
  describe "get_status_code" do
@@ -77,14 +88,14 @@ describe Rawler::Base do
77
88
  rawler.responses[url][:status].should == 200
78
89
  end
79
90
 
80
- it "should add to 302 links" do
91
+ it "should add to 301 links" do
81
92
  url = 'http://example.com/foo'
82
93
  from = 'http://other.com'
83
- register(url, '', 302)
94
+ register(url, '', 301)
84
95
 
85
96
  rawler.send(:add_status_code, url, from)
86
97
 
87
- rawler.responses[url][:status].should == 302
98
+ rawler.responses[url][:status].should == 301
88
99
  end
89
100
 
90
101
  it "should save username and password" do
metadata CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
5
5
  segments:
6
6
  - 0
7
7
  - 0
8
- - 8
9
- version: 0.0.8
8
+ - 9
9
+ version: 0.0.9
10
10
  platform: ruby
11
11
  authors:
12
12
  - Oscar Del Ben
@@ -14,7 +14,7 @@ autorequire:
14
14
  bindir: bin
15
15
  cert_chain: []
16
16
 
17
- date: 2011-03-17 00:00:00 +01:00
17
+ date: 2011-03-25 00:00:00 +01:00
18
18
  default_executable:
19
19
  dependencies:
20
20
  - !ruby/object:Gem::Dependency