rawler 0.0.2 → 0.0.3

This diff compares publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between versions as they appear in their respective public registries.
data/README.txt CHANGED
@@ -4,11 +4,19 @@
 
 == DESCRIPTION:
 
-Rawler is a Ruby library that crawls your website and see the status code of each of your links. Useful for finding dead links.
+Rawler is a Ruby library that crawls your website and checks the status code for each of your links. Useful for finding dead links.
+
+Rawler will only parse pages with content type 'text/html', but it will check the response code of every link.
 
 == SYNOPSIS:
 
-  rawler http://example.com
+  rawler http://example.com [options]
+
+  where [options] are:
+    --username, -u <s>:  HTTP Basic Username
+    --password, -p <s>:  HTTP Basic Password
+    --version, -v:       Print version and exit
+    --help, -h:          Show this message
 
 == INSTALL:
 
@@ -16,10 +24,7 @@ gem install rawler
 
 == TODO
 
-* Handle https (now returns 400). See http://stackoverflow.com/questions/1719809/ruby-on-rails-https-post-bad-request
 * Export to html
-* Handle multiple urls at once
-* Add user agent
 
 == LICENSE:
 
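
As of 0.0.3 the CLI options map onto two extra positional arguments on the library's entry point. A minimal sketch of driving Rawler from Ruby rather than from the command line, based on the Rawler::Base call in bin/rawler below (both credentials are optional and default to nil):

  require 'rawler'

  # Crawls http://example.com, writing "<status code> - <link>" lines
  # to the given output object ($stdout here). The last two arguments
  # are the new HTTP Basic credentials.
  Rawler::Base.new('http://example.com', $stdout, 'user', 'secret').validate
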
data/bin/rawler CHANGED
@@ -1,11 +1,33 @@
 #!/usr/bin/env ruby
 
 require File.join(File.dirname(__FILE__), '..', '/lib/rawler.rb')
+require File.join(File.dirname(__FILE__), '..', '/vendor/lib-trollop.rb')
 
-domain = ARGV[0]
+opts = Trollop::options do
+  version "rawler 0.0.3 (c) 2011 Oscar Del Ben"
+  banner <<-EOS
+Rawler is a command line utility for parsing links on a website
+
+Usage:
+  rawler http://example.com [options]
+
+where [options] are:
+EOS
+
+  # opt :domain, "domain that you want to test", :type => :string
+  opt :username, "HTTP Basic Username", :type => :string
+  opt :password, "HTTP Basic Password", :type => :string
+end
+
+domain = ARGV.shift
 
 if domain.nil?
-  puts "Usage: rawler http://example.com"
+  Trollop::die "Domain name is mandatory. Type --help for help"
+else
+  Trollop::options do
+    opt :domain, "Domain address", :type => :string
+  end
 end
 
-Rawler::Base.new(domain, $stdout).validate
+Rawler::Base.new(domain, $stdout, opts[:username], opts[:password]).validate
+
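
Trollop::options consumes the flags it recognizes out of ARGV, which is why the script can then pick up the remaining positional argument with ARGV.shift. The same pattern in isolation, sketched against the trollop gem rather than the vendored copy:

  require 'trollop'

  opts = Trollop::options do
    opt :username, "HTTP Basic Username", :type => :string
  end

  # --username (and -u) has been parsed out of ARGV at this point,
  # so the first remaining argument is the positional one.
  domain = ARGV.shift
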
data/lib/rawler.rb CHANGED
@@ -1,13 +1,22 @@
 require 'rubygems'
 require 'net/http'
+require 'net/https'
 require 'nokogiri'
 
 $:.unshift(File.dirname(__FILE__)) unless
   $:.include?(File.dirname(__FILE__)) || $:.include?(File.expand_path(File.dirname(__FILE__)))
 
+require 'rawler/core_extensions'
+
 module Rawler
-  VERSION = '0.0.2'
+  VERSION = '0.0.3'
+
+  mattr_accessor :output
+  mattr_accessor :url
+
+  mattr_accessor :username, :password
 
   autoload :Base, "rawler/base"
   autoload :Crawler, "rawler/crawler"
+  autoload :Request, "rawler/request"
 end
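
mattr_accessor is not core Ruby: it comes from the new rawler/core_extensions file, which this diff does not include. By convention (ActiveSupport defines it the same way) it generates module-level readers and writers backed by class variables, which is what lets Base and Crawler share Rawler.output, Rawler.url, Rawler.username and Rawler.password. A minimal sketch under that assumption:

  class Module
    # Define Rawler.foo and Rawler.foo= style accessors backed by a
    # class variable, one pair per symbol.
    def mattr_accessor(*symbols)
      symbols.each do |symbol|
        class_eval <<-RUBY
          @@#{symbol} = nil unless defined? @@#{symbol}

          def self.#{symbol}
            @@#{symbol}
          end

          def self.#{symbol}=(value)
            @@#{symbol} = value
          end
        RUBY
      end
    end
  end
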
data/lib/rawler/base.rb CHANGED
@@ -2,16 +2,19 @@ module Rawler
 
   class Base
 
-    attr_accessor :url, :responses
+    attr_accessor :responses
 
-    def initialize(url, output)
-      @url = url
+    def initialize(url, output, username=nil, password=nil)
       @responses = {}
-      $output = output
+
+      Rawler.url = url
+      Rawler.output = output
+      Rawler.username = username
+      Rawler.password = password
     end
 
     def validate
-      validate_links_in_page(url)
+      validate_links_in_page(Rawler.url)
     end
 
     private
@@ -30,32 +33,29 @@ module Rawler
     end
 
     def add_status_code(link)
-      uri = URI.parse(link)
-
-      response = nil
-
-      Net::HTTP.start(uri.host, uri.port) do |http|
-        path = (uri.path.size == 0) ? "/" : uri.path
-        response = http.head(path, {'User-Agent'=>'Rawler'})
-      end
+      response = Rawler::Request.get(link)
 
-      $output.puts("#{response.code} - #{link}")
+      write("#{response.code} - #{link}")
       responses[link] = { :status => response.code.to_i }
     rescue Errno::ECONNREFUSED
-      puts "Connection refused - '#{link}'"
+      write("Connection refused - '#{link}'")
     rescue Timeout::Error, Errno::EINVAL, Errno::ECONNRESET, EOFError,
            Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError, Net::ProtocolError
-      puts "Connection problems - #{link}"
+      write("Connection problems - '#{link}'")
     end
 
     def same_domain?(link)
-      URI.parse(url).host == URI.parse(link).host
+      URI.parse(Rawler.url).host == URI.parse(link).host
     end
 
     def not_yet_parsed?(link)
       responses[link].nil?
     end
 
+    def write(message)
+      Rawler.output.puts(message)
+    end
+
   end
 
 end
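
Base now delegates all HTTP work to Rawler::Request, a new file (lib/rawler/request.rb) that is not part of the visible diff. Given the new require 'net/https' and the credential accessors, a plausible sketch of what it provides; the method bodies here are an assumption, not the released code:

  module Rawler
    class Request

      def self.get(url)
        perform(url, Net::HTTP::Get)
      end

      def self.head(url)
        perform(url, Net::HTTP::Head)
      end

      # Shared plumbing: enable SSL for https URLs and attach HTTP Basic
      # credentials whenever a username has been configured.
      def self.perform(url, verb)
        uri = URI.parse(url)

        http = Net::HTTP.new(uri.host, uri.port)
        http.use_ssl = (uri.scheme == 'https')

        request = verb.new(uri.path.empty? ? '/' : uri.path)
        request.basic_auth(Rawler.username, Rawler.password) if Rawler.username

        http.request(request)
      end

    end
  end
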
data/lib/rawler/crawler.rb CHANGED
@@ -9,12 +9,16 @@ module Rawler
     end
 
     def links
-      content = Net::HTTP.get(URI.parse(url))
+      if different_domain?(url, Rawler.url) || not_html?(url)
+        return []
+      end
 
-      doc = Nokogiri::HTML(content)
-      doc.css('a').map { |a| absolute_url(a['href']) }
+      response = Rawler::Request.get(url)
+
+      doc = Nokogiri::HTML(response.body)
+      doc.css('a').map { |a| absolute_url(a['href']) }.select { |url| valid_url?(url) }
     rescue Errno::ECONNREFUSED
-      $output.puts "Couldn't connect to #{url}"
+      write("Couldn't connect to #{url}")
       []
     end
 
@@ -23,6 +27,24 @@
     def absolute_url(path)
       URI.parse(url).merge(path.to_s).to_s
     end
+
+    def write(message)
+      Rawler.output.puts(message)
+    end
+
+    def different_domain?(url_1, url_2)
+      URI.parse(url_1).host != URI.parse(url_2).host
+    end
+
+    def not_html?(url)
+      Rawler::Request.head(url).content_type != 'text/html'
+    end
+
+    def valid_url?(url)
+      scheme = URI.parse(url).scheme
+
+      ['http', 'https'].include?(scheme)
+    end
 
   end
 
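
Taken together, links now returns [] outright for pages on a different domain than Rawler.url or with a non-HTML content type, and filters non-http(s) schemes out of its results. A quick illustration, assuming Rawler.url is 'http://example.com' and the page body shown in the comment:

  # Page at http://example.com containing:
  #   <a href="/foo">foo</a>
  #   <a href="mailto:info@example.com">mail</a>
  #   <a href="https://other.com/bar">bar</a>
  Rawler::Crawler.new('http://example.com').links
  # => ["http://example.com/foo", "https://other.com/bar"]
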
data/spec/spec_helper.rb CHANGED
@@ -5,6 +5,6 @@ require 'fakeweb'
 
 FakeWeb.allow_net_connect = false
 
-def register(uri, content, status=200)
-  FakeWeb.register_uri(:any, uri, :body => content, :status => status)
+def register(uri, content, status=200, options={})
+  FakeWeb.register_uri(:any, uri, { :body => content, :status => status, :content_type => 'text/html' }.merge(options))
 end
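
The new options hash lets a spec override the text/html default that register now applies, as the content-type specs further down do:

  register(url, site, 200, :content_type => 'text/plain')
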
@@ -72,6 +72,36 @@ describe Rawler::Base do
     rawler.responses[url][:status].should == 302
   end
 
+  it "should save username and password" do
+    rawler = Rawler::Base.new('http://example.com', output, 'my_user', 'secret')
+
+    Rawler.username.should == 'my_user'
+    Rawler.password.should == 'secret'
+  end
+
+  it "should rescue from Errno::ECONNREFUSED" do
+    url = 'http://example.com'
+
+    Rawler::Request.should_receive(:get).and_raise Errno::ECONNREFUSED
+
+    output.should_receive(:puts).with("Connection refused - '#{url}'")
+
+    rawler.send(:add_status_code, url)
+  end
+
+  [Timeout::Error, Errno::EINVAL, Errno::ECONNRESET, EOFError,
+   Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError, Net::ProtocolError].each do |error|
+    it "should rescue from #{error}" do
+      url = 'http://example.com'
+
+      Rawler::Request.should_receive(:get).and_raise error
+
+      output.should_receive(:puts).with("Connection problems - '#{url}'")
+
+      rawler.send(:add_status_code, url)
+    end
+  end
+
 
 
 
@@ -2,40 +2,96 @@ require File.dirname(__FILE__) + '/../spec_helper.rb'
 
 describe Rawler::Crawler do
 
+  let(:url) { 'http://example.com' }
+
+  before(:each) do
+    Rawler.stub!(:url).and_return(url)
+  end
+
   it "should parse all links" do
-    url = 'http://example.com/'
     register(url, site)
 
     Rawler::Crawler.new(url).links.should == ['http://example.com/foo', 'http://external.com/bar']
   end
 
+  it "should parse relative links" do
+    url = 'http://example.com/path'
+    register(url, '<a href="/foo">foo</a>')
+
+    Rawler::Crawler.new(url).links.should == ['http://example.com/foo']
+  end
+
+  it "should parse links only if the page is in the same domain as the main url" do
+    url = 'http://external.com/path'
+    register(url, '<a href="/foo">foo</a>')
+
+    Rawler.should_receive(:url).and_return('http://example.com')
+
+    Rawler::Crawler.new(url).links.should == []
+  end
+
   it "should return an empty array when raising Errno::ECONNREFUSED" do
-    url = 'http://example.com'
     register(url, site)
+    crawler = Rawler::Crawler.new(url)
 
-    Net::HTTP.should_receive(:get).and_raise Errno::ECONNREFUSED
+    Rawler::Request.should_receive(:get).and_raise Errno::ECONNREFUSED
 
-    crawler = Rawler::Crawler.new(url).links.should == []
+    crawler.links.should == []
   end
 
-  it "should parse relative links" do
-    url = 'http://example.com/path'
-    register(url, '<a href="/foo">foo</a>')
+  it "should print a message when raising Errno::ECONNREFUSED" do
+    output = double('output')
+    register(url, site)
+
+    crawler = Rawler::Crawler.new(url)
+
+    Rawler::Request.should_receive(:get).and_raise Errno::ECONNREFUSED
+    Rawler.should_receive(:output).and_return(output)
+    output.should_receive(:puts).with("Couldn't connect to #{url}")
+
+    crawler.links
+  end
+
+  context "should ignore content type other than text/html" do
+
+    ['text/plain', 'text/css', 'image/jpeg'].each do |content_type|
+
+      it "should ignore '#{content_type}'" do
+        register(url, site, 200, :content_type => content_type)
+
+        crawler = Rawler::Crawler.new(url)
+        crawler.links.should == []
+      end
+
+    end
 
-    Rawler::Crawler.new(url).links.should == ['http://example.com/foo']
   end
 
-  # it "should print a message when raising Errno::ECONNREFUSED" do
-  #   pending "refactor output. Don't use a global variable"
-  #   url = 'http://example.com'
-  #   register(url, site)
-  #
-  #   Net::HTTP.should_receive(:get).and_raise Errno::ECONNREFUSED
-  #
-  #   $stdout.should_receive(:puts).with("Couldn't connect to #{url}")
-  #
-  #   Rawler::Crawler.new(url).links
-  # end
+  it "should ignore links other than http or https" do
+    content = <<-content
+      <a href="http://example.com/valid">foo</a>
+      <a href="mailto:info@example.com">invalid</a>
+      <a href="https://foo.com">valid</a>
+    content
+
+    register(url, content)
+
+    crawler = Rawler::Crawler.new(url)
+    crawler.links.should == ['http://example.com/valid', 'https://foo.com']
+  end
+
+  it "should crawl http basic pages" do
+    content = '<a href="http://example.com/secret-path">foo</a>'
+
+    register('http://example.com/secret', '', :status => ["401", "Unauthorized"])
+    register('http://foo:bar@example.com/secret', content)
+
+    Rawler.stub!(:username).and_return('foo')
+    Rawler.stub!(:password).and_return('bar')
+
+    crawler = Rawler::Crawler.new('http://example.com/secret')
+    crawler.links.should == ['http://example.com/secret-path']
+  end
 
   private
 
metadata CHANGED
@@ -1,13 +1,13 @@
 --- !ruby/object:Gem::Specification
 name: rawler
 version: !ruby/object:Gem::Version
-  hash: 27
+  hash: 25
   prerelease:
   segments:
   - 0
   - 0
-  - 2
-  version: 0.0.2
+  - 3
+  version: 0.0.3
 platform: ruby
 authors:
 - Oscar Del Ben
@@ -15,7 +15,7 @@ autorequire:
 bindir: bin
 cert_chain: []
 
-date: 2011-01-10 00:00:00 +01:00
+date: 2011-01-11 00:00:00 +01:00
 default_executable:
 dependencies:
 - !ruby/object:Gem::Dependency
@@ -48,7 +48,10 @@ dependencies:
           version: 2.8.0
   type: :development
   version_requirements: *id002
-description: Rawler is a Ruby library that crawls your website and see the status code of each of your links. Useful for finding dead links.
+description: |-
+  Rawler is a Ruby library that crawls your website and checks the status code for each of your links. Useful for finding dead links.
+
+  Rawler will only parse pages with content type 'text/html', but it will check the response code of every link.
 email:
 - info@oscardelben.com
 executables:
@@ -107,6 +110,6 @@ rubyforge_project: oscardelben
 rubygems_version: 1.4.1
 signing_key:
 specification_version: 3
-summary: Rawler is a Ruby library that crawls your website and see the status code of each of your links
+summary: Rawler is a Ruby library that crawls your website and checks the status code for each of your links
 test_files: []