rawler 0.0.2 → 0.0.3
Sign up to get free protection for your applications and to get access to all the features.
- data/README.txt +10 -5
- data/bin/rawler +25 -3
- data/lib/rawler.rb +10 -1
- data/lib/rawler/base.rb +17 -17
- data/lib/rawler/crawler.rb +26 -4
- data/spec/spec_helper.rb +2 -2
- data/spec/unit/base_spec.rb +30 -0
- data/spec/unit/crawler_spec.rb +75 -19
- metadata +9 -6
data/README.txt
CHANGED
@@ -4,11 +4,19 @@
|
|
4
4
|
|
5
5
|
== DESCRIPTION:
|
6
6
|
|
7
|
-
Rawler is a Ruby library that crawls your website and
|
7
|
+
Rawler is a Ruby library that crawls your website and checks the status code for each of your links. Useful for finding dead links.
|
8
|
+
|
9
|
+
Rawler will only parse pages with content type 'text/html', but it will check for the response code of every link.
|
8
10
|
|
9
11
|
== SYNOPSIS:
|
10
12
|
|
11
|
-
rawler http://example.com
|
13
|
+
rawler http://example.com [options]
|
14
|
+
|
15
|
+
where [options] are:
|
16
|
+
--username, -u <s>: HTTP Basic Username
|
17
|
+
--password, -p <s>: HTTP Basic Password
|
18
|
+
--version, -v: Print version and exit
|
19
|
+
--help, -h: Show this message
|
12
20
|
|
13
21
|
== INSTALL:
|
14
22
|
|
@@ -16,10 +24,7 @@ gem install rawler
|
|
16
24
|
|
17
25
|
== TODO
|
18
26
|
|
19
|
-
* Handle https (now returns 400). See http://stackoverflow.com/questions/1719809/ruby-on-rails-https-post-bad-request
|
20
27
|
* Export to html
|
21
|
-
* Handle multiple urls at once
|
22
|
-
* Add user agent
|
23
28
|
|
24
29
|
== LICENSE:
|
25
30
|
|
data/bin/rawler
CHANGED
@@ -1,11 +1,33 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
2
|
|
3
3
|
require File.join(File.dirname(__FILE__), '..', '/lib/rawler.rb')
|
4
|
+
require File.join(File.dirname(__FILE__), '..', '/vendor/lib-trollop.rb')
|
4
5
|
|
5
|
-
|
6
|
+
opts = Trollop::options do
|
7
|
+
version "rawler 0.0.3 (c) 2011 Oscar Del Ben"
|
8
|
+
banner <<-EOS
|
9
|
+
Rawler is a command line utility for parsing links on a website
|
10
|
+
|
11
|
+
Usage:
|
12
|
+
rawler http://example.com [options]
|
13
|
+
|
14
|
+
where [options] are:
|
15
|
+
EOS
|
16
|
+
|
17
|
+
# opt :domain, "domain that you want to test", :type => :string
|
18
|
+
opt :username, "HTTP Basic Username", :type => :string
|
19
|
+
opt :password, "HTTP Basic Password", :type => :string
|
20
|
+
end
|
21
|
+
|
22
|
+
domain = ARGV.shift
|
6
23
|
|
7
24
|
if domain.nil?
|
8
|
-
|
25
|
+
Trollop::die "Domain name is mandatory. Type --help for help"
|
26
|
+
else
|
27
|
+
Trollop::options do
|
28
|
+
opt :domain, "Domain address", :type => :string
|
29
|
+
end
|
9
30
|
end
|
10
31
|
|
11
|
-
Rawler::Base.new(domain, $stdout).validate
|
32
|
+
Rawler::Base.new(domain, $stdout, opts[:username], opts[:password]).validate
|
33
|
+
|
data/lib/rawler.rb
CHANGED
@@ -1,13 +1,22 @@
|
|
1
1
|
require 'rubygems'
|
2
2
|
require 'net/http'
|
3
|
+
require 'net/https'
|
3
4
|
require 'nokogiri'
|
4
5
|
|
5
6
|
$:.unshift(File.dirname(__FILE__)) unless
|
6
7
|
$:.include?(File.dirname(__FILE__)) || $:.include?(File.expand_path(File.dirname(__FILE__)))
|
7
8
|
|
9
|
+
require 'rawler/core_extensions'
|
10
|
+
|
8
11
|
module Rawler
|
9
|
-
VERSION = '0.0.
|
12
|
+
VERSION = '0.0.3'
|
13
|
+
|
14
|
+
mattr_accessor :output
|
15
|
+
mattr_accessor :url
|
16
|
+
|
17
|
+
mattr_accessor :username, :password
|
10
18
|
|
11
19
|
autoload :Base, "rawler/base"
|
12
20
|
autoload :Crawler, "rawler/crawler"
|
21
|
+
autoload :Request, "rawler/request"
|
13
22
|
end
|
data/lib/rawler/base.rb
CHANGED
@@ -2,16 +2,19 @@ module Rawler
|
|
2
2
|
|
3
3
|
class Base
|
4
4
|
|
5
|
-
attr_accessor :
|
5
|
+
attr_accessor :responses
|
6
6
|
|
7
|
-
def initialize(url, output)
|
8
|
-
@url = url
|
7
|
+
def initialize(url, output, username=nil, password=nil)
|
9
8
|
@responses = {}
|
10
|
-
|
9
|
+
|
10
|
+
Rawler.url = url
|
11
|
+
Rawler.output = output
|
12
|
+
Rawler.username = username
|
13
|
+
Rawler.password = password
|
11
14
|
end
|
12
15
|
|
13
16
|
def validate
|
14
|
-
validate_links_in_page(url)
|
17
|
+
validate_links_in_page(Rawler.url)
|
15
18
|
end
|
16
19
|
|
17
20
|
private
|
@@ -30,32 +33,29 @@ module Rawler
|
|
30
33
|
end
|
31
34
|
|
32
35
|
def add_status_code(link)
|
33
|
-
|
34
|
-
|
35
|
-
response = nil
|
36
|
-
|
37
|
-
Net::HTTP.start(uri.host, uri.port) do |http|
|
38
|
-
path = (uri.path.size == 0) ? "/" : uri.path
|
39
|
-
response = http.head(path, {'User-Agent'=>'Rawler'})
|
40
|
-
end
|
36
|
+
response = Rawler::Request.get(link)
|
41
37
|
|
42
|
-
|
38
|
+
write("#{response.code} - #{link}")
|
43
39
|
responses[link] = { :status => response.code.to_i }
|
44
40
|
rescue Errno::ECONNREFUSED
|
45
|
-
|
41
|
+
write("Connection refused - '#{link}'")
|
46
42
|
rescue Timeout::Error, Errno::EINVAL, Errno::ECONNRESET, EOFError,
|
47
43
|
Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError, Net::ProtocolError
|
48
|
-
|
44
|
+
write("Connection problems - '#{link}'")
|
49
45
|
end
|
50
46
|
|
51
47
|
def same_domain?(link)
|
52
|
-
URI.parse(url).host == URI.parse(link).host
|
48
|
+
URI.parse(Rawler.url).host == URI.parse(link).host
|
53
49
|
end
|
54
50
|
|
55
51
|
def not_yet_parsed?(link)
|
56
52
|
responses[link].nil?
|
57
53
|
end
|
58
54
|
|
55
|
+
def write(message)
|
56
|
+
Rawler.output.puts(message)
|
57
|
+
end
|
58
|
+
|
59
59
|
end
|
60
60
|
|
61
61
|
end
|
data/lib/rawler/crawler.rb
CHANGED
@@ -9,12 +9,16 @@ module Rawler
|
|
9
9
|
end
|
10
10
|
|
11
11
|
def links
|
12
|
-
|
12
|
+
if different_domain?(url, Rawler.url) || not_html?(url)
|
13
|
+
return []
|
14
|
+
end
|
13
15
|
|
14
|
-
|
15
|
-
|
16
|
+
response = Rawler::Request.get(url)
|
17
|
+
|
18
|
+
doc = Nokogiri::HTML(response.body)
|
19
|
+
doc.css('a').map { |a| absolute_url(a['href']) }.select { |url| valid_url?(url) }
|
16
20
|
rescue Errno::ECONNREFUSED
|
17
|
-
|
21
|
+
write("Couldn't connect to #{url}")
|
18
22
|
[]
|
19
23
|
end
|
20
24
|
|
@@ -23,6 +27,24 @@ module Rawler
|
|
23
27
|
def absolute_url(path)
|
24
28
|
URI.parse(url).merge(path.to_s).to_s
|
25
29
|
end
|
30
|
+
|
31
|
+
def write(message)
|
32
|
+
Rawler.output.puts(message)
|
33
|
+
end
|
34
|
+
|
35
|
+
def different_domain?(url_1, url_2)
|
36
|
+
URI.parse(url_1).host != URI.parse(url_2).host
|
37
|
+
end
|
38
|
+
|
39
|
+
def not_html?(url)
|
40
|
+
Rawler::Request.head(url).content_type != 'text/html'
|
41
|
+
end
|
42
|
+
|
43
|
+
def valid_url?(url)
|
44
|
+
scheme = URI.parse(url).scheme
|
45
|
+
|
46
|
+
['http', 'https'].include?(scheme)
|
47
|
+
end
|
26
48
|
|
27
49
|
end
|
28
50
|
|
data/spec/spec_helper.rb
CHANGED
@@ -5,6 +5,6 @@ require 'fakeweb'
|
|
5
5
|
|
6
6
|
FakeWeb.allow_net_connect = false
|
7
7
|
|
8
|
-
def register(uri, content, status=200)
|
9
|
-
FakeWeb.register_uri(:any, uri, :body => content, :status => status)
|
8
|
+
def register(uri, content, status=200, options={})
|
9
|
+
FakeWeb.register_uri(:any, uri, { :body => content, :status => status, :content_type => 'text/html' }.merge(options))
|
10
10
|
end
|
data/spec/unit/base_spec.rb
CHANGED
@@ -72,6 +72,36 @@ describe Rawler::Base do
|
|
72
72
|
rawler.responses[url][:status].should == 302
|
73
73
|
end
|
74
74
|
|
75
|
+
it "should save username and password" do
|
76
|
+
rawler = Rawler::Base.new('http://example.com', output, 'my_user', 'secret')
|
77
|
+
|
78
|
+
Rawler.username.should == 'my_user'
|
79
|
+
Rawler.password.should == 'secret'
|
80
|
+
end
|
81
|
+
|
82
|
+
it "should rescue from Errno::ECONNREFUSED" do
|
83
|
+
url = 'http://example.com'
|
84
|
+
|
85
|
+
Rawler::Request.should_receive(:get).and_raise Errno::ECONNREFUSED
|
86
|
+
|
87
|
+
output.should_receive(:puts).with("Connection refused - '#{url}'")
|
88
|
+
|
89
|
+
rawler.send(:add_status_code, url)
|
90
|
+
end
|
91
|
+
|
92
|
+
[Timeout::Error, Errno::EINVAL, Errno::ECONNRESET, EOFError,
|
93
|
+
Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError, Net::ProtocolError].each do |error|
|
94
|
+
it "should rescue from #{error}" do
|
95
|
+
url = 'http://example.com'
|
96
|
+
|
97
|
+
Rawler::Request.should_receive(:get).and_raise error
|
98
|
+
|
99
|
+
output.should_receive(:puts).with("Connection problems - '#{url}'")
|
100
|
+
|
101
|
+
rawler.send(:add_status_code, url)
|
102
|
+
end
|
103
|
+
end
|
104
|
+
|
75
105
|
end
|
76
106
|
|
77
107
|
|
data/spec/unit/crawler_spec.rb
CHANGED
@@ -2,40 +2,96 @@ require File.dirname(__FILE__) + '/../spec_helper.rb'
|
|
2
2
|
|
3
3
|
describe Rawler::Crawler do
|
4
4
|
|
5
|
+
let(:url) { 'http://example.com' }
|
6
|
+
|
7
|
+
before(:each) do
|
8
|
+
Rawler.stub!(:url).and_return(url)
|
9
|
+
end
|
10
|
+
|
5
11
|
it "should parse all links" do
|
6
|
-
url = 'http://example.com/'
|
7
12
|
register(url, site)
|
8
13
|
|
9
14
|
Rawler::Crawler.new(url).links.should == ['http://example.com/foo', 'http://external.com/bar']
|
10
15
|
end
|
11
16
|
|
17
|
+
it "should parse relative links" do
|
18
|
+
url = 'http://example.com/path'
|
19
|
+
register(url, '<a href="/foo">foo</a>')
|
20
|
+
|
21
|
+
Rawler::Crawler.new(url).links.should == ['http://example.com/foo']
|
22
|
+
end
|
23
|
+
|
24
|
+
it "should parse links only if the page is in the same domain as the main url" do
|
25
|
+
url = 'http://external.com/path'
|
26
|
+
register(url, '<a href="/foo">foo</a>')
|
27
|
+
|
28
|
+
Rawler.should_receive(:url).and_return('http://example.com')
|
29
|
+
|
30
|
+
Rawler::Crawler.new(url).links.should == []
|
31
|
+
end
|
32
|
+
|
12
33
|
it "should return an empty array when raising Errno::ECONNREFUSED" do
|
13
|
-
url = 'http://example.com'
|
14
34
|
register(url, site)
|
35
|
+
crawler = Rawler::Crawler.new(url)
|
15
36
|
|
16
|
-
|
37
|
+
Rawler::Request.should_receive(:get).and_raise Errno::ECONNREFUSED
|
17
38
|
|
18
|
-
crawler
|
39
|
+
crawler.links.should == []
|
19
40
|
end
|
20
41
|
|
21
|
-
it "should
|
22
|
-
|
23
|
-
register(url,
|
42
|
+
it "should print a message when raising Errno::ECONNREFUSED" do
|
43
|
+
output = double('output')
|
44
|
+
register(url, site)
|
45
|
+
|
46
|
+
crawler = Rawler::Crawler.new(url)
|
47
|
+
|
48
|
+
Rawler::Request.should_receive(:get).and_raise Errno::ECONNREFUSED
|
49
|
+
Rawler.should_receive(:output).and_return(output)
|
50
|
+
output.should_receive(:puts).with("Couldn't connect to #{url}")
|
51
|
+
|
52
|
+
crawler.links
|
53
|
+
end
|
54
|
+
|
55
|
+
context "should ignore content type other than text/html" do
|
56
|
+
|
57
|
+
['text/plain', 'text/css', 'image/jpeg'].each do |content_type|
|
58
|
+
|
59
|
+
it "should ignore '#{content_type}'" do
|
60
|
+
register(url, site, 200, :content_type => content_type)
|
61
|
+
|
62
|
+
crawler = Rawler::Crawler.new(url)
|
63
|
+
crawler.links.should == []
|
64
|
+
end
|
65
|
+
|
66
|
+
end
|
24
67
|
|
25
|
-
Rawler::Crawler.new(url).links.should == ['http://example.com/foo']
|
26
68
|
end
|
27
69
|
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
70
|
+
it "should ignore links other than http or https" do
|
71
|
+
content = <<-content
|
72
|
+
<a href="http://example.com/valid">foo</a>
|
73
|
+
<a href="mailto:info@example.com">invalid</a>
|
74
|
+
<a href="https://foo.com">valid</a>
|
75
|
+
content
|
76
|
+
|
77
|
+
register(url, content)
|
78
|
+
|
79
|
+
crawler = Rawler::Crawler.new(url)
|
80
|
+
crawler.links.should == ['http://example.com/valid', 'https://foo.com']
|
81
|
+
end
|
82
|
+
|
83
|
+
it "should crawl http basic pages" do
|
84
|
+
content = '<a href="http://example.com/secret-path">foo</a>'
|
85
|
+
|
86
|
+
register('http://example.com/secret', '', :status => ["401", "Unauthorized"])
|
87
|
+
register('http://foo:bar@example.com/secret', content)
|
88
|
+
|
89
|
+
Rawler.stub!(:username).and_return('foo')
|
90
|
+
Rawler.stub!(:password).and_return('bar')
|
91
|
+
|
92
|
+
crawler = Rawler::Crawler.new('http://example.com/secret')
|
93
|
+
crawler.links.should == ['http://example.com/secret-path']
|
94
|
+
end
|
39
95
|
|
40
96
|
private
|
41
97
|
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rawler
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 25
|
5
5
|
prerelease:
|
6
6
|
segments:
|
7
7
|
- 0
|
8
8
|
- 0
|
9
|
-
-
|
10
|
-
version: 0.0.
|
9
|
+
- 3
|
10
|
+
version: 0.0.3
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Oscar Del Ben
|
@@ -15,7 +15,7 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date: 2011-01-
|
18
|
+
date: 2011-01-11 00:00:00 +01:00
|
19
19
|
default_executable:
|
20
20
|
dependencies:
|
21
21
|
- !ruby/object:Gem::Dependency
|
@@ -48,7 +48,10 @@ dependencies:
|
|
48
48
|
version: 2.8.0
|
49
49
|
type: :development
|
50
50
|
version_requirements: *id002
|
51
|
-
description:
|
51
|
+
description: |-
|
52
|
+
Rawler is a Ruby library that crawls your website and checks the status code for each of your links. Useful for finding dead links.
|
53
|
+
|
54
|
+
Rawler will only parse pages with content type 'text/html', but it will check for the response code of every link.
|
52
55
|
email:
|
53
56
|
- info@oscardelben.com
|
54
57
|
executables:
|
@@ -107,6 +110,6 @@ rubyforge_project: oscardelben
|
|
107
110
|
rubygems_version: 1.4.1
|
108
111
|
signing_key:
|
109
112
|
specification_version: 3
|
110
|
-
summary: Rawler is a Ruby library that crawls your website and
|
113
|
+
summary: Rawler is a Ruby library that crawls your website and checks the status code for each of your links
|
111
114
|
test_files: []
|
112
115
|
|