rawler 0.0.2 → 0.0.3

data/README.txt CHANGED
@@ -4,11 +4,19 @@
 
 == DESCRIPTION:
 
-Rawler is a Ruby library that crawls your website and see the status code of each of your links. Useful for finding dead links.
+Rawler is a Ruby library that crawls your website and checks the status code for each of your links. Useful for finding dead links.
+
+Rawler will only parse pages with content type 'text/html', but it will check for the response code of every link.
 
 == SYNOPSIS:
 
-  rawler http://example.com
+  rawler http://example.com [options]
+
+  where [options] are:
+  --username, -u <s>: HTTP Basic Username
+  --password, -p <s>: HTTP Basic Password
+  --version, -v: Print version and exit
+  --help, -h: Show this message
 
 == INSTALL:
 
@@ -16,10 +24,7 @@ gem install rawler
 
 == TODO
 
-* Handle https (now returns 400). See http://stackoverflow.com/questions/1719809/ruby-on-rails-https-post-bad-request
 * Export to html
-* Handle multiple urls at once
-* Add user agent
 
 == LICENSE:
 
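The same crawl can also be started from Ruby code. A minimal sketch of the library call that the updated data/bin/rawler below performs, with a placeholder host and credentials (the two trailing arguments are optional and only used for HTTP Basic auth):

  require 'rubygems'
  require 'rawler'

  # Prints one "<status code> - <link>" line per link found on the site.
  Rawler::Base.new('http://example.com', $stdout, 'user', 'secret').validate
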
data/bin/rawler CHANGED
@@ -1,11 +1,33 @@
 #!/usr/bin/env ruby
 
 require File.join(File.dirname(__FILE__), '..', '/lib/rawler.rb')
+require File.join(File.dirname(__FILE__), '..', '/vendor/lib-trollop.rb')
 
-domain = ARGV[0]
+opts = Trollop::options do
+  version "rawler 0.0.3 (c) 2011 Oscar Del Ben"
+  banner <<-EOS
+Rawler is a command line utility for parsing links on a website
+
+Usage:
+  rawler http://example.com [options]
+
+where [options] are:
+EOS
+
+  # opt :domain, "domain that you want to test", :type => :string
+  opt :username, "HTTP Basic Username", :type => :string
+  opt :password, "HTTP Basic Password", :type => :string
+end
+
+domain = ARGV.shift
 
 if domain.nil?
-  puts "Usage: rawler http://example.com"
+  Trollop::die "Domain name is mandatory. Type --help for help"
+else
+  Trollop::options do
+    opt :domain, "Domain address", :type => :string
+  end
 end
 
-Rawler::Base.new(domain, $stdout).validate
+Rawler::Base.new(domain, $stdout, opts[:username], opts[:password]).validate
+
@@ -1,13 +1,22 @@
 require 'rubygems'
 require 'net/http'
+require 'net/https'
 require 'nokogiri'
 
 $:.unshift(File.dirname(__FILE__)) unless
   $:.include?(File.dirname(__FILE__)) || $:.include?(File.expand_path(File.dirname(__FILE__)))
 
+require 'rawler/core_extensions'
+
 module Rawler
-  VERSION = '0.0.2'
+  VERSION = '0.0.3'
+
+  mattr_accessor :output
+  mattr_accessor :url
+
+  mattr_accessor :username, :password
 
   autoload :Base, "rawler/base"
   autoload :Crawler, "rawler/crawler"
+  autoload :Request, "rawler/request"
 end
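The newly required lib/rawler/core_extensions.rb is not included in this diff view. A minimal sketch of what it would need to provide for the mattr_accessor calls above to work, modeled on ActiveSupport's Module#mattr_accessor (the implementation below is an assumption, not the gem's actual code):

  class Module
    # Defines module-level reader and writer methods backed by a class variable,
    # e.g. `mattr_accessor :output` adds Rawler.output and Rawler.output=.
    def mattr_accessor(*names)
      names.each do |name|
        class_eval <<-EOS, __FILE__, __LINE__ + 1
          @@#{name} = nil unless defined? @@#{name}

          def self.#{name}
            @@#{name}
          end

          def self.#{name}=(value)
            @@#{name} = value
          end
        EOS
      end
    end
  end
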
@@ -2,16 +2,19 @@ module Rawler
 
   class Base
 
-    attr_accessor :url, :responses
+    attr_accessor :responses
 
-    def initialize(url, output)
-      @url = url
+    def initialize(url, output, username=nil, password=nil)
       @responses = {}
-      $output = output
+
+      Rawler.url = url
+      Rawler.output = output
+      Rawler.username = username
+      Rawler.password = password
     end
 
     def validate
-      validate_links_in_page(url)
+      validate_links_in_page(Rawler.url)
     end
 
     private
@@ -30,32 +33,29 @@ module Rawler
     end
 
     def add_status_code(link)
-      uri = URI.parse(link)
-
-      response = nil
-
-      Net::HTTP.start(uri.host, uri.port) do |http|
-        path = (uri.path.size == 0) ? "/" : uri.path
-        response = http.head(path, {'User-Agent'=>'Rawler'})
-      end
+      response = Rawler::Request.get(link)
 
-      $output.puts("#{response.code} - #{link}")
+      write("#{response.code} - #{link}")
       responses[link] = { :status => response.code.to_i }
     rescue Errno::ECONNREFUSED
-      puts "Connection refused - '#{link}'"
+      write("Connection refused - '#{link}'")
     rescue Timeout::Error, Errno::EINVAL, Errno::ECONNRESET, EOFError,
            Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError, Net::ProtocolError
-      puts "Connection problems - #{link}"
+      write("Connection problems - '#{link}'")
    end
 
     def same_domain?(link)
-      URI.parse(url).host == URI.parse(link).host
+      URI.parse(Rawler.url).host == URI.parse(link).host
    end
 
     def not_yet_parsed?(link)
       responses[link].nil?
     end
 
+    def write(message)
+      Rawler.output.puts(message)
+    end
+
   end
 
 end
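Rawler::Request itself (the new lib/rawler/request.rb) is not part of this diff view. Based on how it is called here and in the crawler below, via Rawler::Request.get(link) and Rawler::Request.head(url).content_type, a compatible implementation might look roughly like this; treat every detail beyond those two method names as an assumption:

  require 'net/http'
  require 'net/https'
  require 'uri'

  module Rawler
    class Request

      def self.get(url)
        perform(Net::HTTP::Get, url)
      end

      def self.head(url)
        perform(Net::HTTP::Head, url)
      end

      def self.perform(verb, url)
        uri  = URI.parse(url)
        http = Net::HTTP.new(uri.host, uri.port)
        http.use_ssl = (uri.scheme == 'https')

        request = verb.new(uri.path.empty? ? '/' : uri.path)
        # Credentials come from the new command line options via mattr_accessor.
        request.basic_auth(Rawler.username, Rawler.password) if Rawler.username

        http.request(request)
      end
    end
  end
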
@@ -9,12 +9,16 @@ module Rawler
     end
 
     def links
-      content = Net::HTTP.get(URI.parse(url))
+      if different_domain?(url, Rawler.url) || not_html?(url)
+        return []
+      end
 
-      doc = Nokogiri::HTML(content)
-      doc.css('a').map { |a| absolute_url(a['href']) }
+      response = Rawler::Request.get(url)
+
+      doc = Nokogiri::HTML(response.body)
+      doc.css('a').map { |a| absolute_url(a['href']) }.select { |url| valid_url?(url) }
     rescue Errno::ECONNREFUSED
-      $output.puts "Couldn't connect to #{url}"
+      write("Couldn't connect to #{url}")
       []
     end
 
@@ -23,6 +27,24 @@ module Rawler
     def absolute_url(path)
       URI.parse(url).merge(path.to_s).to_s
     end
+
+    def write(message)
+      Rawler.output.puts(message)
+    end
+
+    def different_domain?(url_1, url_2)
+      URI.parse(url_1).host != URI.parse(url_2).host
+    end
+
+    def not_html?(url)
+      Rawler::Request.head(url).content_type != 'text/html'
+    end
+
+    def valid_url?(url)
+      scheme = URI.parse(url).scheme
+
+      ['http', 'https'].include?(scheme)
+    end
 
   end
 
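Taken together, the checks above mean Crawler#links only parses same-domain HTML pages and only returns http/https links, always as absolute URLs. A quick illustration against a hypothetical page (behaviour mirrors the spec changes further down):

  # Assuming http://example.com serves 'text/html' with anchors to /foo,
  # mailto:info@example.com and https://foo.com:
  Rawler::Crawler.new('http://example.com').links
  # => ["http://example.com/foo", "https://foo.com"]
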
@@ -5,6 +5,6 @@ require 'fakeweb'
 
 FakeWeb.allow_net_connect = false
 
-def register(uri, content, status=200)
-  FakeWeb.register_uri(:any, uri, :body => content, :status => status)
+def register(uri, content, status=200, options={})
+  FakeWeb.register_uri(:any, uri, { :body => content, :status => status, :content_type => 'text/html' }.merge(options))
 end
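A usage note on the updated spec helper: registrations now default to a 'text/html' content type, and the trailing options hash can override any FakeWeb option, which is how the crawler specs further down fake non-HTML responses (the URLs and bodies here are made up):

  # Served with the default 'text/html' content type:
  register('http://example.com/', '<a href="/foo">foo</a>')

  # Content type overridden through the options hash:
  register('http://example.com/style.css', 'body {}', 200, :content_type => 'text/css')
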
@@ -72,6 +72,36 @@ describe Rawler::Base do
       rawler.responses[url][:status].should == 302
     end
 
+    it "should save username and password" do
+      rawler = Rawler::Base.new('http://example.com', output, 'my_user', 'secret')
+
+      Rawler.username.should == 'my_user'
+      Rawler.password.should == 'secret'
+    end
+
+    it "should rescue from Errno::ECONNREFUSED" do
+      url = 'http://example.com'
+
+      Rawler::Request.should_receive(:get).and_raise Errno::ECONNREFUSED
+
+      output.should_receive(:puts).with("Connection refused - '#{url}'")
+
+      rawler.send(:add_status_code, url)
+    end
+
+    [Timeout::Error, Errno::EINVAL, Errno::ECONNRESET, EOFError,
+     Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError, Net::ProtocolError].each do |error|
+      it "should rescue from #{error}" do
+        url = 'http://example.com'
+
+        Rawler::Request.should_receive(:get).and_raise error
+
+        output.should_receive(:puts).with("Connection problems - '#{url}'")
+
+        rawler.send(:add_status_code, url)
+      end
+    end
+
   end
 
 
@@ -2,40 +2,96 @@ require File.dirname(__FILE__) + '/../spec_helper.rb'
 
 describe Rawler::Crawler do
 
+  let(:url) { 'http://example.com' }
+
+  before(:each) do
+    Rawler.stub!(:url).and_return(url)
+  end
+
   it "should parse all links" do
-    url = 'http://example.com/'
     register(url, site)
 
     Rawler::Crawler.new(url).links.should == ['http://example.com/foo', 'http://external.com/bar']
   end
 
+  it "should parse relative links" do
+    url = 'http://example.com/path'
+    register(url, '<a href="/foo">foo</a>')
+
+    Rawler::Crawler.new(url).links.should == ['http://example.com/foo']
+  end
+
+  it "should parse links only if the page is in the same domain as the main url" do
+    url = 'http://external.com/path'
+    register(url, '<a href="/foo">foo</a>')
+
+    Rawler.should_receive(:url).and_return('http://example.com')
+
+    Rawler::Crawler.new(url).links.should == []
+  end
+
   it "should return an empty array when raising Errno::ECONNREFUSED" do
-    url = 'http://example.com'
     register(url, site)
+    crawler = Rawler::Crawler.new(url)
 
-    Net::HTTP.should_receive(:get).and_raise Errno::ECONNREFUSED
+    Rawler::Request.should_receive(:get).and_raise Errno::ECONNREFUSED
 
-    crawler = Rawler::Crawler.new(url).links.should == []
+    crawler.links.should == []
   end
 
-  it "should parse relative links" do
-    url = 'http://example.com/path'
-    register(url, '<a href="/foo">foo</a>')
+  it "should print a message when raising Errno::ECONNREFUSED" do
+    output = double('output')
+    register(url, site)
+
+    crawler = Rawler::Crawler.new(url)
+
+    Rawler::Request.should_receive(:get).and_raise Errno::ECONNREFUSED
+    Rawler.should_receive(:output).and_return(output)
+    output.should_receive(:puts).with("Couldn't connect to #{url}")
+
+    crawler.links
+  end
+
+  context "should ignore content type other than text/html" do
+
+    ['text/plain', 'text/css', 'image/jpeg'].each do |content_type|
+
+      it "should ignore '#{content_type}'" do
+        register(url, site, 200, :content_type => content_type)
+
+        crawler = Rawler::Crawler.new(url)
+        crawler.links.should == []
+      end
+
+    end
 
-    Rawler::Crawler.new(url).links.should == ['http://example.com/foo']
   end
 
-  # it "should print a message when raising Errno::ECONNREFUSED" do
-  #   pending "refactor output. Don't use a global variable"
-  #   url = 'http://example.com'
-  #   register(url, site)
-  #
-  #   Net::HTTP.should_receive(:get).and_raise Errno::ECONNREFUSED
-  #
-  #   $stdout.should_receive(:puts).with("Couldn't connect to #{url}")
-  #
-  #   Rawler::Crawler.new(url).links
-  # end
+  it "should ignore links other than http or https" do
+    content = <<-content
+      <a href="http://example.com/valid">foo</a>
+      <a href="mailto:info@example.com">invalid</a>
+      <a href="https://foo.com">valid</a>
+    content
+
+    register(url, content)
+
+    crawler = Rawler::Crawler.new(url)
+    crawler.links.should == ['http://example.com/valid', 'https://foo.com']
+  end
+
+  it "should crawl http basic pages" do
+    content = '<a href="http://example.com/secret-path">foo</a>'
+
+    register('http://example.com/secret', '', :status => ["401", "Unauthorized"])
+    register('http://foo:bar@example.com/secret', content)
+
+    Rawler.stub!(:username).and_return('foo')
+    Rawler.stub!(:password).and_return('bar')
+
+    crawler = Rawler::Crawler.new('http://example.com/secret')
+    crawler.links.should == ['http://example.com/secret-path']
+  end
 
   private
 
metadata CHANGED
@@ -1,13 +1,13 @@
 --- !ruby/object:Gem::Specification
 name: rawler
 version: !ruby/object:Gem::Version
-  hash: 27
+  hash: 25
   prerelease:
   segments:
   - 0
   - 0
-  - 2
-  version: 0.0.2
+  - 3
+  version: 0.0.3
 platform: ruby
 authors:
 - Oscar Del Ben
@@ -15,7 +15,7 @@ autorequire:
 bindir: bin
 cert_chain: []
 
-date: 2011-01-10 00:00:00 +01:00
+date: 2011-01-11 00:00:00 +01:00
 default_executable:
 dependencies:
 - !ruby/object:Gem::Dependency
@@ -48,7 +48,10 @@ dependencies:
         version: 2.8.0
   type: :development
   version_requirements: *id002
-description: Rawler is a Ruby library that crawls your website and see the status code of each of your links. Useful for finding dead links.
+description: |-
+  Rawler is a Ruby library that crawls your website and checks the status code for each of your links. Useful for finding dead links.
+
+  Rawler will only parse pages with content type 'text/html', but it will check for the response code of every link.
 email:
 - info@oscardelben.com
 executables:
@@ -107,6 +110,6 @@ rubyforge_project: oscardelben
 rubygems_version: 1.4.1
 signing_key:
 specification_version: 3
-summary: Rawler is a Ruby library that crawls your website and see the status code of each of your links
+summary: Rawler is a Ruby library that crawls your website and checks the status code for each of your links
 test_files: []
 