rawler 0.0.8 → 0.0.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README.txt CHANGED
@@ -13,8 +13,8 @@ Rawler will only parse pages with content type 'text/html', but it will check fo
13
13
  rawler http://example.com [options]
14
14
 
15
15
  where [options] are:
16
- --username, -u <s>: HTT Basic Username
17
- --password, -p <s>: HTT Basic Password
16
+ --username, -u <s>: HTTP Basic Username
17
+ --password, -p <s>: HTTP Basic Password
18
18
  --version, -v: Print version and exit
19
19
  --help, -h: Show this message
20
20
 
@@ -40,13 +40,6 @@ If you add files, run:
40
40
 
41
41
  And add them to the Manifest file.
42
42
 
43
- == TODO
44
-
45
- * Add logger levels
46
- * Follow redirects, but still inform about them
47
- * Respect robots.txt
48
- * Export to html
49
-
50
43
  == CONTRIBUTORS:
51
44
 
52
45
  * bcoob
@@ -7,7 +7,7 @@ require 'logger'
7
7
  require 'rawler/core_extensions'
8
8
 
9
9
  module Rawler
10
- VERSION = '0.0.8'
10
+ VERSION = '0.0.9'
11
11
 
12
12
  mattr_accessor :output
13
13
  mattr_accessor :url
@@ -36,14 +36,16 @@ module Rawler
36
36
 
37
37
  def add_status_code(link, from_url)
38
38
  response = Rawler::Request.get(link)
39
+
40
+ validate_page(response['Location'], from_url) if response['Location']
39
41
 
40
- record_response(response.code, link, from_url)
42
+ record_response(response.code, link, from_url, response['Location'])
41
43
  responses[link] = { :status => response.code.to_i }
42
44
  rescue Errno::ECONNREFUSED
43
- Rawler.output.error("Connection refused - #{link} - Called from: #{from_url}")
45
+ error("Connection refused - #{link} - Called from: #{from_url}")
44
46
  rescue Timeout::Error, Errno::EINVAL, Errno::ECONNRESET, Errno::ETIMEDOUT,
45
47
  EOFError, Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError, Net::ProtocolError, SocketError
46
- Rawler.output.error("Connection problems - #{link} - Called from: #{from_url}")
48
+ error("Connection problems - #{link} - Called from: #{from_url}")
47
49
  end
48
50
 
49
51
  def same_domain?(link)
@@ -54,18 +56,18 @@ module Rawler
54
56
  responses[link].nil?
55
57
  end
56
58
 
57
- def write(message)
58
- # TODO: This may not always be an error message,
59
- # but that will make it show up most of the time
59
+ def error(message)
60
60
  Rawler.output.error(message)
61
61
  end
62
62
 
63
- def record_response(code, link, from_url)
63
+ def record_response(code, link, from_url, redirection=nil)
64
64
  message = "#{code} - #{link}"
65
65
 
66
66
  if code.to_i >= 300
67
67
  message += " - Called from: #{from_url}"
68
68
  end
69
+
70
+ message += " - Following redirection to: #{redirection}" if redirection
69
71
 
70
72
  code = code.to_i
71
73
  case code / 100
@@ -19,10 +19,10 @@ module Rawler
19
19
 
20
20
  doc = Nokogiri::HTML(response.body)
21
21
  doc.css('a').map { |a| a['href'] }.select { |url| !url.nil? }.map { |url| absolute_url(url) }.select { |url| valid_url?(url) }
22
- rescue Errno::ECONNREFUSED
22
+ rescue Errno::ECONNREFUSED # TODO: add called from
23
23
  write("Couldn't connect to #{url}")
24
24
  []
25
- rescue Errno::ETIMEDOUT
25
+ rescue Errno::ETIMEDOUT # TODO: add called from
26
26
  write("Connection to #{url} timed out")
27
27
  []
28
28
  end
@@ -38,8 +38,12 @@ module Rawler
38
38
  else
39
39
  path
40
40
  end
41
+ rescue URI::InvalidURIError
42
+ write("Invalid url: #{path} - Called from: #{url}")
43
+ nil
41
44
  end
42
45
 
46
+ # TODO: add 'called from' in a more pragmatic way as an optional parameter
43
47
  def write(message)
44
48
  Rawler.output.error(message)
45
49
  end
@@ -5,7 +5,7 @@ require File.dirname(__FILE__) + '/../../spec_helper.rb'
5
5
  describe Rawler::Crawler do
6
6
 
7
7
  let(:url) { 'http://example.com' }
8
- let(:output) { double("output", :error => nil) }
8
+ let(:output) { double('output', :error => nil) }
9
9
 
10
10
  before(:each) do
11
11
  Rawler.stub!(:url).and_return(url)
@@ -164,11 +164,9 @@ describe Rawler::Crawler do
164
164
 
165
165
  let(:url) { 'http://example.com' }
166
166
  let(:crawler) { Rawler::Crawler.new(url) }
167
- let(:output) { double('output', :error => nil) }
168
167
 
169
168
  before(:each) do
170
169
  register(url, '')
171
- Rawler.stub!(:output).and_return(output)
172
170
  end
173
171
 
174
172
  context "Errno::ECONNREFUSED" do
@@ -250,5 +248,20 @@ describe Rawler::Crawler do
250
248
  crawler.links.should == ['http://example.com/valid', 'https://foo.com', 'http://fooo.com']
251
249
  end
252
250
  end
251
+
252
+ context "invalid urls" do
253
+ let(:content) { '<a href="http://foo;bar">foo</a>' }
254
+ let(:url) { 'http://example.com' }
255
+ let(:crawler) { Rawler::Crawler.new(url) }
256
+
257
+ before(:each) do
258
+ register(url, content)
259
+ end
260
+
261
+ it "should notify about the invalid url" do
262
+ output.should_receive(:error).with('Invalid url: http://foo;bar - Called from: http://example.com')
263
+ crawler.links.should == []
264
+ end
265
+ end
253
266
 
254
267
  end
@@ -53,16 +53,27 @@ describe Rawler::Base do
53
53
  register('http://example.com/foo1', '<a href="http://external.com/foo">x</a>')
54
54
  register('http://example.com/foo2', '')
55
55
  register('http://external.com', '')
56
- register('http://external.com/foo', '', 302)
56
+ register('http://external.com/foo', '', 301)
57
57
 
58
58
  output.should_receive(:info).with('200 - http://example.com/foo1')
59
59
  output.should_receive(:info).with('200 - http://example.com/foo2')
60
60
  output.should_receive(:info).with('200 - http://external.com')
61
- output.should_receive(:warn).with('302 - http://external.com/foo - Called from: http://example.com/foo1')
61
+ output.should_receive(:warn).with('301 - http://external.com/foo - Called from: http://example.com/foo1')
62
62
 
63
63
  rawler.validate
64
64
  end
65
65
 
66
+ it "should follow redirections but inform about them" do
67
+ register('http://example.com', '<a href="/foo">foo</a>')
68
+ register('http://example.com/foo', '', 301, :location => 'http://example.com/bar')
69
+ register('http://example.com/bar', '')
70
+
71
+ output.should_receive(:warn).with('301 - http://example.com/foo - Called from: http://example.com - Following redirection to: http://example.com/bar')
72
+ output.should_receive(:info).with('200 - http://example.com/bar')
73
+
74
+ rawler.validate
75
+ end
76
+
66
77
  end
67
78
 
68
79
  describe "get_status_code" do
@@ -77,14 +88,14 @@ describe Rawler::Base do
77
88
  rawler.responses[url][:status].should == 200
78
89
  end
79
90
 
80
- it "should add to 302 links" do
91
+ it "should add to 301 links" do
81
92
  url = 'http://example.com/foo'
82
93
  from = 'http://other.com'
83
- register(url, '', 302)
94
+ register(url, '', 301)
84
95
 
85
96
  rawler.send(:add_status_code, url, from)
86
97
 
87
- rawler.responses[url][:status].should == 302
98
+ rawler.responses[url][:status].should == 301
88
99
  end
89
100
 
90
101
  it "should save username and password" do
metadata CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
5
5
  segments:
6
6
  - 0
7
7
  - 0
8
- - 8
9
- version: 0.0.8
8
+ - 9
9
+ version: 0.0.9
10
10
  platform: ruby
11
11
  authors:
12
12
  - Oscar Del Ben
@@ -14,7 +14,7 @@ autorequire:
14
14
  bindir: bin
15
15
  cert_chain: []
16
16
 
17
- date: 2011-03-17 00:00:00 +01:00
17
+ date: 2011-03-25 00:00:00 +01:00
18
18
  default_executable:
19
19
  dependencies:
20
20
  - !ruby/object:Gem::Dependency