rawler 0.0.2 → 0.0.3
This diff shows the content of publicly available package versions as released to their public registries; it is provided for informational purposes only.
- data/README.txt +10 -5
- data/bin/rawler +25 -3
- data/lib/rawler.rb +10 -1
- data/lib/rawler/base.rb +17 -17
- data/lib/rawler/crawler.rb +26 -4
- data/spec/spec_helper.rb +2 -2
- data/spec/unit/base_spec.rb +30 -0
- data/spec/unit/crawler_spec.rb +75 -19
- metadata +9 -6
data/README.txt
CHANGED
@@ -4,11 +4,19 @@
 
 == DESCRIPTION:
 
-Rawler is a Ruby library that crawls your website and
+Rawler is a Ruby library that crawls your website and checks the status code for each of your links. Useful for finding dead links.
+
+Rawler will only parse pages with content type 'text/html', but it will check for the response code of every link.
 
 == SYNOPSIS:
 
-  rawler http://example.com
+  rawler http://example.com [options]
+
+  where [options] are:
+  --username, -u <s>: HTT Basic Username
+  --password, -p <s>: HTT Basic Password
+  --version, -v: Print version and exit
+  --help, -h: Show this message
 
 == INSTALL:
 
@@ -16,10 +24,7 @@ gem install rawler
 
 == TODO
 
-* Handle https (now returns 400). See http://stackoverflow.com/questions/1719809/ruby-on-rails-https-post-bad-request
 * Export to html
-* Handle multiple urls at once
-* Add user agent
 
 == LICENSE:
 
data/bin/rawler
CHANGED
@@ -1,11 +1,33 @@
 #!/usr/bin/env ruby
 
 require File.join(File.dirname(__FILE__), '..', '/lib/rawler.rb')
+require File.join(File.dirname(__FILE__), '..', '/vendor/lib-trollop.rb')
 
-
+opts = Trollop::options do
+  version "rawler 0.0.3 (c) 2011 Oscar Del Ben"
+  banner <<-EOS
+Rawler is a command line utility for parsing links on a website
+
+Usage:
+  rawler http://example.com [options]
+
+where [options] are:
+EOS
+
+  # opt :domain, "domain that you want to test", :type => :string
+  opt :username, "HTT Basic Username", :type => :string
+  opt :password, "HTT Basic Password", :type => :string
+end
+
+domain = ARGV.shift
 
 if domain.nil?
-
+  Trollop::die "Domain name is mandatory. Type --help for help"
+else
+  Trollop::options do
+    opt :domain, "Domain address", :type => :string
+  end
 end
 
-Rawler::Base.new(domain, $stdout).validate
+Rawler::Base.new(domain, $stdout, opts[:username], opts[:password]).validate
+
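With the options parsed by Trollop, the executable simply forwards the credentials to Rawler::Base. A minimal sketch of the equivalent programmatic call; the URL and credentials here are placeholder values:

  require 'rubygems'
  require 'rawler'

  # Same effect as: rawler http://example.com -u my_user -p secret
  # 'http://example.com', 'my_user' and 'secret' are placeholders.
  Rawler::Base.new('http://example.com', $stdout, 'my_user', 'secret').validate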
data/lib/rawler.rb
CHANGED
@@ -1,13 +1,22 @@
 require 'rubygems'
 require 'net/http'
+require 'net/https'
 require 'nokogiri'
 
 $:.unshift(File.dirname(__FILE__)) unless
   $:.include?(File.dirname(__FILE__)) || $:.include?(File.expand_path(File.dirname(__FILE__)))
 
+require 'rawler/core_extensions'
+
 module Rawler
-  VERSION = '0.0.2'
+  VERSION = '0.0.3'
+
+  mattr_accessor :output
+  mattr_accessor :url
+
+  mattr_accessor :username, :password
 
   autoload :Base, "rawler/base"
   autoload :Crawler, "rawler/crawler"
+  autoload :Request, "rawler/request"
 end
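mattr_accessor is not core Ruby; it is presumably supplied by the newly required rawler/core_extensions, whose source is not shown in this diff. A hypothetical sketch of such an ActiveSupport-style module accessor, assuming a plain class-variable implementation:

  # Hypothetical sketch only: the real rawler/core_extensions file is not in this diff.
  class Module
    def mattr_accessor(*names)
      names.each do |name|
        class_variable_set("@@#{name}", nil) unless class_variable_defined?("@@#{name}")
        # Module-level reader and writer, e.g. Rawler.url and Rawler.url=
        define_singleton_method(name) { class_variable_get("@@#{name}") }
        define_singleton_method("#{name}=") { |value| class_variable_set("@@#{name}", value) }
      end
    end
  end

Whatever the actual implementation, the effect is that Rawler.url, Rawler.output, Rawler.username and Rawler.password become module-level state that Base, Crawler and Request can share without passing it around.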
data/lib/rawler/base.rb
CHANGED
@@ -2,16 +2,19 @@ module Rawler
 
   class Base
 
-    attr_accessor :url, :responses
+    attr_accessor :responses
 
-    def initialize(url, output)
-      @url = url
+    def initialize(url, output, username=nil, password=nil)
       @responses = {}
-
+
+      Rawler.url = url
+      Rawler.output = output
+      Rawler.username = username
+      Rawler.password = password
     end
 
     def validate
-      validate_links_in_page(url)
+      validate_links_in_page(Rawler.url)
    end
 
    private
@@ -30,32 +33,29 @@ module Rawler
    end
 
    def add_status_code(link)
-
-
-      response = nil
-
-      Net::HTTP.start(uri.host, uri.port) do |http|
-        path = (uri.path.size == 0) ? "/" : uri.path
-        response = http.head(path, {'User-Agent'=>'Rawler'})
-      end
+      response = Rawler::Request.get(link)
 
-
+      write("#{response.code} - #{link}")
      responses[link] = { :status => response.code.to_i }
    rescue Errno::ECONNREFUSED
-
+      write("Connection refused - '#{link}'")
    rescue Timeout::Error, Errno::EINVAL, Errno::ECONNRESET, EOFError,
      Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError, Net::ProtocolError
-
+      write("Connection problems - '#{link}'")
    end
 
    def same_domain?(link)
-      URI.parse(url).host == URI.parse(link).host
+      URI.parse(Rawler.url).host == URI.parse(link).host
    end
 
    def not_yet_parsed?(link)
      responses[link].nil?
    end
 
+    def write(message)
+      Rawler.output.puts(message)
+    end
+
  end
 
 end
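add_status_code now delegates the HTTP round trip to Rawler::Request, a newly autoloaded file (rawler/request.rb) whose contents do not appear in this diff. Given the get/head calls, the new require 'net/https', and the username/password accessors, a hypothetical sketch of what such a wrapper could look like:

  # Hypothetical sketch only: the real rawler/request.rb is not in this diff.
  module Rawler
    class Request

      def self.get(url)
        perform(:get, url)
      end

      def self.head(url)
        perform(:head, url)
      end

      def self.perform(verb, url)
        uri = URI.parse(url)

        http = Net::HTTP.new(uri.host, uri.port)
        http.use_ssl = (uri.scheme == 'https')  # relies on require 'net/https'

        path = uri.path.empty? ? '/' : uri.path
        request = verb == :head ? Net::HTTP::Head.new(path) : Net::HTTP::Get.new(path)

        # HTTP Basic credentials come from the new module-level accessors.
        if Rawler.username && Rawler.password
          request.basic_auth(Rawler.username, Rawler.password)
        end

        http.request(request)
      end

    end
  end

A wrapper along these lines would also explain the removed TODO item about https: switching from a bare Net::HTTP.start plus http.head to a request object that can flip use_ssl is what makes https links checkable.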
data/lib/rawler/crawler.rb
CHANGED
@@ -9,12 +9,16 @@ module Rawler
     end
 
     def links
-
+      if different_domain?(url, Rawler.url) || not_html?(url)
+        return []
+      end
 
-
-
+      response = Rawler::Request.get(url)
+
+      doc = Nokogiri::HTML(response.body)
+      doc.css('a').map { |a| absolute_url(a['href']) }.select { |url| valid_url?(url) }
     rescue Errno::ECONNREFUSED
-
+      write("Couldn't connect to #{url}")
       []
     end
 
@@ -23,6 +27,24 @@ module Rawler
     def absolute_url(path)
       URI.parse(url).merge(path.to_s).to_s
     end
+
+    def write(message)
+      Rawler.output.puts(message)
+    end
+
+    def different_domain?(url_1, url_2)
+      URI.parse(url_1).host != URI.parse(url_2).host
+    end
+
+    def not_html?(url)
+      Rawler::Request.head(url).content_type != 'text/html'
+    end
+
+    def valid_url?(url)
+      scheme = URI.parse(url).scheme
+
+      ['http', 'https'].include?(scheme)
+    end
 
   end
 
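Taken together, links now returns early for pages on a foreign domain or with a non-HTML content type, fetches the page through Rawler::Request, and keeps only absolute http/https links. An illustrative call; this performs real HTTP requests unless stubbed (e.g. with FakeWeb, as in the specs):

  Rawler.url    = 'http://example.com'  # root URL the crawl is scoped to
  Rawler.output = $stdout

  # Absolute URLs for every <a href> on the page; mailto:, javascript:
  # and other non-http(s) schemes are dropped by valid_url?.
  Rawler::Crawler.new('http://example.com').links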
data/spec/spec_helper.rb
CHANGED
@@ -5,6 +5,6 @@ require 'fakeweb'
 
 FakeWeb.allow_net_connect = false
 
-def register(uri, content, status=200)
-  FakeWeb.register_uri(:any, uri, :body => content, :status => status)
+def register(uri, content, status=200, options={})
+  FakeWeb.register_uri(:any, uri, { :body => content, :status => status, :content_type => 'text/html' }.merge(options))
 end
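The extra options hash lets individual specs override the default text/html stub that register now applies. Illustrative calls; the URLs are placeholders:

  # Stubbed as an HTML page by default
  register('http://example.com/', '<a href="/foo">foo</a>')

  # Override the stubbed content type, as the crawler specs do
  register('http://example.com/style.css', '', 200, :content_type => 'text/css')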
data/spec/unit/base_spec.rb
CHANGED
@@ -72,6 +72,36 @@ describe Rawler::Base do
     rawler.responses[url][:status].should == 302
   end
 
+  it "should save username and password" do
+    rawler = Rawler::Base.new('http://example.com', output, 'my_user', 'secret')
+
+    Rawler.username.should == 'my_user'
+    Rawler.password.should == 'secret'
+  end
+
+  it "should rescue from Errno::ECONNREFUSED" do
+    url = 'http://example.com'
+
+    Rawler::Request.should_receive(:get).and_raise Errno::ECONNREFUSED
+
+    output.should_receive(:puts).with("Connection refused - '#{url}'")
+
+    rawler.send(:add_status_code, url)
+  end
+
+  [Timeout::Error, Errno::EINVAL, Errno::ECONNRESET, EOFError,
+    Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError, Net::ProtocolError].each do |error|
+    it "should rescue from #{error}" do
+      url = 'http://example.com'
+
+      Rawler::Request.should_receive(:get).and_raise error
+
+      output.should_receive(:puts).with("Connection problems - '#{url}'")
+
+      rawler.send(:add_status_code, url)
+    end
+  end
+
 end
 
 
data/spec/unit/crawler_spec.rb
CHANGED
@@ -2,40 +2,96 @@ require File.dirname(__FILE__) + '/../spec_helper.rb'
 
 describe Rawler::Crawler do
 
+  let(:url) { 'http://example.com' }
+
+  before(:each) do
+    Rawler.stub!(:url).and_return(url)
+  end
+
   it "should parse all links" do
-    url = 'http://example.com/'
     register(url, site)
 
     Rawler::Crawler.new(url).links.should == ['http://example.com/foo', 'http://external.com/bar']
   end
 
+  it "should parse relative links" do
+    url = 'http://example.com/path'
+    register(url, '<a href="/foo">foo</a>')
+
+    Rawler::Crawler.new(url).links.should == ['http://example.com/foo']
+  end
+
+  it "should parse links only if the page is in the same domain as the main url" do
+    url = 'http://external.com/path'
+    register(url, '<a href="/foo">foo</a>')
+
+    Rawler.should_receive(:url).and_return('http://example.com')
+
+    Rawler::Crawler.new(url).links.should == []
+  end
+
   it "should return an empty array when raising Errno::ECONNREFUSED" do
-    url = 'http://example.com'
     register(url, site)
+    crawler = Rawler::Crawler.new(url)
 
-
+    Rawler::Request.should_receive(:get).and_raise Errno::ECONNREFUSED
 
-    crawler
+    crawler.links.should == []
   end
 
-  it "should
-
-    register(url,
+  it "should print a message when raising Errno::ECONNREFUSED" do
+    output = double('output')
+    register(url, site)
+
+    crawler = Rawler::Crawler.new(url)
+
+    Rawler::Request.should_receive(:get).and_raise Errno::ECONNREFUSED
+    Rawler.should_receive(:output).and_return(output)
+    output.should_receive(:puts).with("Couldn't connect to #{url}")
+
+    crawler.links
+  end
+
+  context "should ignore content type other than text/html" do
+
+    ['text/plain', 'text/css', 'image/jpeg'].each do |content_type|
+
+      it "should ignore '#{content_type}'" do
+        register(url, site, 200, :content_type => content_type)
+
+        crawler = Rawler::Crawler.new(url)
+        crawler.links.should == []
+      end
+
+    end
 
-    Rawler::Crawler.new(url).links.should == ['http://example.com/foo']
   end
 
-
-
-
-
-
-
-
-
-
-
-
+  it "should ignore links other than http or https" do
+    content = <<-content
+      <a href="http://example.com/valid">foo</a>
+      <a href="mailto:info@example.com">invalid</a>
+      <a href="https://foo.com">valid</a>
+    content
+
+    register(url, content)
+
+    crawler = Rawler::Crawler.new(url)
+    crawler.links.should == ['http://example.com/valid', 'https://foo.com']
+  end
+
+  it "should crawl http basic pages" do
+    content = '<a href="http://example.com/secret-path">foo</a>'
+
+    register('http://example.com/secret', '', :status => ["401", "Unauthorized"])
+    register('http://foo:bar@example.com/secret', content)
+
+    Rawler.stub!(:username).and_return('foo')
+    Rawler.stub!(:password).and_return('bar')
+
+    crawler = Rawler::Crawler.new('http://example.com/secret')
+    crawler.links.should == ['http://example.com/secret-path']
+  end
 
   private
 
metadata
CHANGED
@@ -1,13 +1,13 @@
 --- !ruby/object:Gem::Specification
 name: rawler
 version: !ruby/object:Gem::Version
-  hash:
+  hash: 25
   prerelease:
   segments:
   - 0
  - 0
-  - 2
-  version: 0.0.2
+  - 3
+  version: 0.0.3
 platform: ruby
 authors:
 - Oscar Del Ben
@@ -15,7 +15,7 @@ autorequire:
 bindir: bin
 cert_chain: []
 
-date: 2011-01-
+date: 2011-01-11 00:00:00 +01:00
 default_executable:
 dependencies:
 - !ruby/object:Gem::Dependency
@@ -48,7 +48,10 @@ dependencies:
         version: 2.8.0
   type: :development
   version_requirements: *id002
-description:
+description: |-
+  Rawler is a Ruby library that crawls your website and checks the status code for each of your links. Useful for finding dead links.
+
+  Rawler will only parse pages with content type 'text/html', but it will check for the response code of every link.
 email:
 - info@oscardelben.com
 executables:
@@ -107,6 +110,6 @@ rubyforge_project: oscardelben
 rubygems_version: 1.4.1
 signing_key:
 specification_version: 3
-summary: Rawler is a Ruby library that crawls your website and
+summary: Rawler is a Ruby library that crawls your website and checks the status code for each of your links
 test_files: []
 