rawler 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/History.txt +6 -0
- data/Manifest.txt +13 -0
- data/README.txt +47 -0
- data/Rakefile +24 -0
- data/bin/rawler +11 -0
- data/lib/rawler.rb +13 -0
- data/lib/rawler/base.rb +61 -0
- data/lib/rawler/crawler.rb +23 -0
- data/spec/spec.opts +1 -0
- data/spec/spec_helper.rb +10 -0
- data/spec/unit/base_spec.rb +93 -0
- data/spec/unit/crawler_spec.rb +30 -0
- data/tasks/rspec.rake +21 -0
- metadata +112 -0
data/History.txt
ADDED
data/Manifest.txt
ADDED
data/README.txt
ADDED
@@ -0,0 +1,47 @@
|
|
1
|
+
= rawler
|
2
|
+
|
3
|
+
* http://github.com/#{github_username}/#{project_name}
|
4
|
+
|
5
|
+
== DESCRIPTION:
|
6
|
+
|
7
|
+
Rawler is a Ruby library that crawls your website and checks the status code of each of your links. Useful for finding dead links.
|
8
|
+
|
9
|
+
== SYNOPSIS:
|
10
|
+
|
11
|
+
rawler http://example.com
|
12
|
+
|
13
|
+
== INSTALL:
|
14
|
+
|
15
|
+
gem install rawler
|
16
|
+
|
17
|
+
== TODO
|
18
|
+
|
19
|
+
* Handle https (now returns 400). See http://stackoverflow.com/questions/1719809/ruby-on-rails-https-post-bad-request
|
20
|
+
* Export to html
|
21
|
+
* Handle multiple urls at once
|
22
|
+
* Add user agent
|
23
|
+
|
24
|
+
== LICENSE:
|
25
|
+
|
26
|
+
(The MIT License)
|
27
|
+
|
28
|
+
Copyright (c) 2011 Oscar Del Ben
|
29
|
+
|
30
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
31
|
+
a copy of this software and associated documentation files (the
|
32
|
+
'Software'), to deal in the Software without restriction, including
|
33
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
34
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
35
|
+
permit persons to whom the Software is furnished to do so, subject to
|
36
|
+
the following conditions:
|
37
|
+
|
38
|
+
The above copyright notice and this permission notice shall be
|
39
|
+
included in all copies or substantial portions of the Software.
|
40
|
+
|
41
|
+
THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
|
42
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
43
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
44
|
+
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
45
|
+
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
46
|
+
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
47
|
+
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/Rakefile
ADDED
@@ -0,0 +1,24 @@
|
|
1
|
+
# -*- ruby -*-
|
2
|
+
|
3
|
+
require 'rubygems'
|
4
|
+
require 'hoe'
|
5
|
+
|
6
|
+
# Hoe.plugin :compiler
|
7
|
+
# Hoe.plugin :gem_prelude_sucks
|
8
|
+
# Hoe.plugin :inline
|
9
|
+
# Hoe.plugin :racc
|
10
|
+
# Hoe.plugin :rubyforge
|
11
|
+
|
12
|
+
# Gem specification for rawler, declared through Hoe.
Hoe.spec 'rawler' do

  # Author name and contact e-mail recorded in the gem metadata.
  developer('Oscar Del Ben', 'info@oscardelben.com')

  # RubyForge project name the gem is associated with.
  self.rubyforge_name = 'oscardelben'

  # nokogiri is needed at runtime to parse the crawled HTML.
  extra_deps << ['nokogiri']
end
|
23
|
+
|
24
|
+
# vim: syntax=ruby
|
data/bin/rawler
ADDED
data/lib/rawler.rb
ADDED
@@ -0,0 +1,13 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'net/http'
|
3
|
+
require 'nokogiri'
|
4
|
+
|
5
|
+
# Put this file's directory at the front of the load path, unless it is
# already there (either as given or in expanded form).
lib_dir = File.dirname(__FILE__)
unless $:.include?(lib_dir) || $:.include?(File.expand_path(lib_dir))
  $:.unshift(lib_dir)
end
|
7
|
+
|
8
|
+
# Top-level namespace of the rawler gem.
module Rawler
  # Version string of the gem.
  VERSION = '0.0.1'

  # The classes are loaded lazily, the first time each constant is referenced.
  autoload :Base, "rawler/base"
  autoload :Crawler, "rawler/crawler"
end
|
data/lib/rawler/base.rb
ADDED
@@ -0,0 +1,61 @@
|
|
1
|
+
module Rawler

  # Checks every link reachable from a starting page.
  #
  # Each discovered link gets a HEAD request; the numeric status is stored in
  # +responses+ (keyed by the link string) and a "<code> - <link>" line is
  # written to the output object. Links on the same host as the starting URL
  # are crawled in turn; links on other hosts are only checked.
  class Base

    attr_accessor :url, :responses

    # url    - String URL of the page the crawl starts from.
    # output - object responding to +puts+ that receives all report lines.
    #          It is stored in the $output global, which Rawler::Crawler
    #          also writes to.
    def initialize(url, output)
      @url = url
      @responses = {}
      $output = output
    end

    # Checks all links found on the starting page, following same-host
    # links recursively.
    def validate
      validate_links_in_page(url)
    end

    private

    # Fetches +current_url+ and validates every link found on it.
    def validate_links_in_page(current_url)
      Rawler::Crawler.new(current_url).links.each do |page_url|
        validate_page(page_url)
      end
    end

    # Records the status of +page_url+ once, then descends into it only when
    # it lives on the same host as the starting URL.
    def validate_page(page_url)
      if not_yet_parsed?(page_url)
        add_status_code(page_url)
        validate_links_in_page(page_url) if same_domain?(page_url)
      end
    end

    # Issues a HEAD request for +link+, reports the result and stores
    # { :status => Integer } in +responses+.
    #
    # Connection failures are reported through $output (the same sink used
    # for successful results) and leave +responses+ untouched for that link.
    def add_status_code(link)
      uri = URI.parse(link)

      response = nil

      Net::HTTP.start(uri.host, uri.port) do |http|
        response = http.head(request_path(uri), {'User-Agent'=>'Rawler'})
      end

      $output.puts("#{response.code} - #{link}")
      responses[link] = { :status => response.code.to_i }
    rescue Errno::ECONNREFUSED
      $output.puts "Connection refused - '#{link}'"
    rescue Timeout::Error, Errno::EINVAL, Errno::ECONNRESET, EOFError,
      Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError, Net::ProtocolError
      $output.puts "Connection problems - #{link}"
    end

    # Builds the request target for +uri+: "/" when the path is empty, and
    # with the query string appended so "/page?id=1" is not checked as "/page".
    def request_path(uri)
      path = (uri.path.size == 0) ? "/" : uri.path
      uri.query ? "#{path}?#{uri.query}" : path
    end

    # True when +link+ points at the same host as the starting URL.
    def same_domain?(link)
      URI.parse(url).host == URI.parse(link).host
    end

    # True when no status has been recorded for +link+ yet.
    def not_yet_parsed?(link)
      responses[link].nil?
    end

  end

end
|
@@ -0,0 +1,23 @@
|
|
1
|
+
module Rawler

  # Downloads a single page and extracts the targets of its anchors.
  class Crawler

    attr_accessor :url
    # Kept so existing callers of +links=+ keep working; the +links+ reader
    # below always re-fetches the page and does not read the assigned value.
    attr_writer :links

    # url - String URL of the page to fetch.
    def initialize(url)
      @url = url
    end

    # Fetches +url+ and returns the href value of every <a> element, in
    # document order. Anchors without an href attribute are skipped so the
    # result never contains nil.
    #
    # When the connection is refused, a message is written to $output and an
    # empty Array is returned.
    def links
      content = Net::HTTP.get(URI.parse(url))

      doc = Nokogiri::HTML(content)
      doc.css('a').map { |a| a['href'] }.compact
    rescue Errno::ECONNREFUSED
      $output.puts "Couldn't connect to #{url}"
      []
    end

  end

end
|
data/spec/spec.opts
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
--colour
|
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1,93 @@
|
|
1
|
+
require File.dirname(__FILE__) + '/../spec_helper.rb'
|
2
|
+
|
3
|
+
# Specs for Rawler::Base.
#
# `register` is not defined in this file; presumably it stubs the HTTP
# response (body and optional status code) for a URL -- confirm in the
# spec helper.
describe Rawler::Base do

  # Null-object double standing in for the output stream.
  let(:output) { double('output').as_null_object }
  let(:rawler) { Rawler::Base.new('http://example.com', output) }

  # Every example starts with the root page serving the `site` markup below.
  before(:each) do
    register('http://example.com', site)
  end

  describe "validate_links" do

    it "should validate links recursively" do
      register('http://example.com/foo1', '<a href="http://external.com/foo">x</a>')
      register('http://example.com/foo2', '')
      register('http://external.com', '')
      register('http://external.com/foo', '')

      rawler.validate

      # Links on the root page and the link found on foo1 must all be recorded.
      rawler.responses['http://example.com/foo1'].should_not be_nil
      rawler.responses['http://example.com/foo2'].should_not be_nil
      rawler.responses['http://external.com'].should_not be_nil
      rawler.responses['http://external.com/foo'].should_not be_nil
    end

    it "should not validate links on external pages" do
      register('http://example.com/foo', '<a href="http://external.com/foo">x</a>')
      register('http://external.com/foo', '<a href="http://external.com/bar">x</a>')
      register('http://external.com/bar', '')

      rawler.validate

      # The external page itself is checked, but the link it contains is not.
      rawler.responses['http://external.com/foo'].should_not be_nil
      rawler.responses['http://external.com/bar'].should be_nil
    end

    it "should output results" do
      register('http://example.com/foo1', '<a href="http://external.com/foo">x</a>')
      register('http://example.com/foo2', '')
      register('http://external.com', '')
      register('http://external.com/foo', '', 302)

      # One "<status> - <url>" line is expected per checked link.
      output.should_receive(:puts).with('200 - http://example.com/foo1')
      output.should_receive(:puts).with('200 - http://example.com/foo2')
      output.should_receive(:puts).with('200 - http://external.com')
      output.should_receive(:puts).with('302 - http://external.com/foo')

      rawler.validate
    end

  end

  # NOTE(review): this group is labelled "get_status_code" but the examples
  # call the private method add_status_code.
  describe "get_status_code" do

    it "should add to 200 links" do
      url = 'http://example.com/foo'
      register(url, '', 200)

      # add_status_code is private, hence the call through send.
      rawler.send(:add_status_code, url)

      rawler.responses[url][:status].should == 200
    end

    it "should add to 302 links" do
      url = 'http://example.com/foo'
      register(url, '', 302)

      rawler.send(:add_status_code, url)

      rawler.responses[url][:status].should == 302
    end

  end


  private

  # Markup served for the root page: two same-host links and one external link.
  def site
    <<-site
<html>
<body>
<a href="http://example.com/foo1">foo1</a>
<a href="http://example.com/foo2">foo2</a>

<a href="http://external.com">external</a>
</body>
</html>
    site
  end

end
|
@@ -0,0 +1,30 @@
|
|
1
|
+
require File.dirname(__FILE__) + '/../spec_helper.rb'
|
2
|
+
|
3
|
+
# Specs for Rawler::Crawler.
#
# `register` is not defined in this file; presumably it stubs the HTTP
# response body for a URL -- confirm in the spec helper.
describe Rawler::Crawler do

  it "should parse all links" do
    url = 'http://example.com'
    register(url, site)

    # Both anchors of the page are expected, in document order.
    Rawler::Crawler.new(url).links.should == ['http://example.com/foo', 'http://external.com/bar']
  end

  private

  # Sample page: one paragraph of filler text followed by two links.
  def site
    <<-site
<!DOCTYPE html>
<html>
<body>
<p>Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.</p>

<p><a href="http://example.com/foo">foo</a></p>

<p><a href="http://external.com/bar">bar</a></p>

</body>
</html>
    site
  end

end
|
data/tasks/rspec.rake
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
# Load RSpec. If the first attempt fails, load RubyGems (unless the
# NO_RUBYGEMS environment variable is set) and try once more; a second
# failure is not rescued.
begin
  require 'spec'
rescue LoadError
  require 'rubygems' unless ENV['NO_RUBYGEMS']
  require 'spec'
end
# Load the RSpec rake task. When it is unavailable, print installation
# instructions and terminate the process with exit status 0.
begin
  require 'spec/rake/spectask'
rescue LoadError
  puts <<-EOS
To use rspec for testing you must install rspec gem:
gem install rspec
EOS
  exit(0)
end
|
16
|
+
|
17
|
+
# Defines the rake task that runs the spec suite.
# The description now matches the file list below: every *_spec.rb file
# anywhere under spec/ is run, not only a spec/models directory.
desc "Run all specs under spec/"
Spec::Rake::SpecTask.new do |t|
  # Command-line options for the runner are read from spec/spec.opts.
  t.spec_opts = ['--options', "spec/spec.opts"]
  t.spec_files = FileList['spec/**/*_spec.rb']
end
|
metadata
ADDED
@@ -0,0 +1,112 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: rawler
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
hash: 29
|
5
|
+
prerelease:
|
6
|
+
segments:
|
7
|
+
- 0
|
8
|
+
- 0
|
9
|
+
- 1
|
10
|
+
version: 0.0.1
|
11
|
+
platform: ruby
|
12
|
+
authors:
|
13
|
+
- Oscar Del Ben
|
14
|
+
autorequire:
|
15
|
+
bindir: bin
|
16
|
+
cert_chain: []
|
17
|
+
|
18
|
+
date: 2011-01-10 00:00:00 +01:00
|
19
|
+
default_executable:
|
20
|
+
dependencies:
|
21
|
+
- !ruby/object:Gem::Dependency
|
22
|
+
name: nokogiri
|
23
|
+
prerelease: false
|
24
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - ">="
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
hash: 3
|
30
|
+
segments:
|
31
|
+
- 0
|
32
|
+
version: "0"
|
33
|
+
type: :runtime
|
34
|
+
version_requirements: *id001
|
35
|
+
- !ruby/object:Gem::Dependency
|
36
|
+
name: hoe
|
37
|
+
prerelease: false
|
38
|
+
requirement: &id002 !ruby/object:Gem::Requirement
|
39
|
+
none: false
|
40
|
+
requirements:
|
41
|
+
- - ">="
|
42
|
+
- !ruby/object:Gem::Version
|
43
|
+
hash: 47
|
44
|
+
segments:
|
45
|
+
- 2
|
46
|
+
- 8
|
47
|
+
- 0
|
48
|
+
version: 2.8.0
|
49
|
+
type: :development
|
50
|
+
version_requirements: *id002
|
51
|
+
description: Rawler is a Ruby library that crawls your website and see the status code of each of your links. Useful for finding dead links.
|
52
|
+
email:
|
53
|
+
- info@oscardelben.com
|
54
|
+
executables:
|
55
|
+
- rawler
|
56
|
+
extensions: []
|
57
|
+
|
58
|
+
extra_rdoc_files:
|
59
|
+
- History.txt
|
60
|
+
- Manifest.txt
|
61
|
+
- README.txt
|
62
|
+
files:
|
63
|
+
- History.txt
|
64
|
+
- Manifest.txt
|
65
|
+
- README.txt
|
66
|
+
- Rakefile
|
67
|
+
- bin/rawler
|
68
|
+
- lib/rawler/base.rb
|
69
|
+
- lib/rawler/crawler.rb
|
70
|
+
- lib/rawler.rb
|
71
|
+
- spec/spec.opts
|
72
|
+
- spec/spec_helper.rb
|
73
|
+
- spec/unit/base_spec.rb
|
74
|
+
- spec/unit/crawler_spec.rb
|
75
|
+
- tasks/rspec.rake
|
76
|
+
has_rdoc: true
|
77
|
+
homepage: http://github.com/#{github_username}/#{project_name}
|
78
|
+
licenses: []
|
79
|
+
|
80
|
+
post_install_message:
|
81
|
+
rdoc_options:
|
82
|
+
- --main
|
83
|
+
- README.txt
|
84
|
+
require_paths:
|
85
|
+
- lib
|
86
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
87
|
+
none: false
|
88
|
+
requirements:
|
89
|
+
- - ">="
|
90
|
+
- !ruby/object:Gem::Version
|
91
|
+
hash: 3
|
92
|
+
segments:
|
93
|
+
- 0
|
94
|
+
version: "0"
|
95
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
96
|
+
none: false
|
97
|
+
requirements:
|
98
|
+
- - ">="
|
99
|
+
- !ruby/object:Gem::Version
|
100
|
+
hash: 3
|
101
|
+
segments:
|
102
|
+
- 0
|
103
|
+
version: "0"
|
104
|
+
requirements: []
|
105
|
+
|
106
|
+
rubyforge_project: oscardelben
|
107
|
+
rubygems_version: 1.4.1
|
108
|
+
signing_key:
|
109
|
+
specification_version: 3
|
110
|
+
summary: Rawler is a Ruby library that crawls your website and see the status code of each of your links
|
111
|
+
test_files: []
|
112
|
+
|