rawler 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
data/History.txt ADDED
@@ -0,0 +1,6 @@
1
+ === 1.0.0 / 2011-01-10
2
+
3
+ * 1 major enhancement
4
+
5
+ * Birthday!
6
+
data/Manifest.txt ADDED
@@ -0,0 +1,13 @@
1
+ History.txt
2
+ Manifest.txt
3
+ README.txt
4
+ Rakefile
5
+ bin/rawler
6
+ lib/rawler/base.rb
7
+ lib/rawler/crawler.rb
8
+ lib/rawler.rb
9
+ spec/spec.opts
10
+ spec/spec_helper.rb
11
+ spec/unit/base_spec.rb
12
+ spec/unit/crawler_spec.rb
13
+ tasks/rspec.rake
data/README.txt ADDED
@@ -0,0 +1,47 @@
1
+ = rawler
2
+
3
+ * http://github.com/oscardelben/rawler
4
+
5
+ == DESCRIPTION:
6
+
7
+ Rawler is a Ruby library that crawls your website and checks the status code of each of your links. Useful for finding dead links.
8
+
9
+ == SYNOPSIS:
10
+
11
+ rawler http://example.com
12
+
13
+ == INSTALL:
14
+
15
+ gem install rawler
16
+
17
+ == TODO
18
+
19
+ * Handle https (now returns 400). See http://stackoverflow.com/questions/1719809/ruby-on-rails-https-post-bad-request
20
+ * Export to html
21
+ * Handle multiple urls at once
22
+ * Add user agent
23
+
24
+ == LICENSE:
25
+
26
+ (The MIT License)
27
+
28
+ Copyright (c) 2011 Oscar Del Ben
29
+
30
+ Permission is hereby granted, free of charge, to any person obtaining
31
+ a copy of this software and associated documentation files (the
32
+ 'Software'), to deal in the Software without restriction, including
33
+ without limitation the rights to use, copy, modify, merge, publish,
34
+ distribute, sublicense, and/or sell copies of the Software, and to
35
+ permit persons to whom the Software is furnished to do so, subject to
36
+ the following conditions:
37
+
38
+ The above copyright notice and this permission notice shall be
39
+ included in all copies or substantial portions of the Software.
40
+
41
+ THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
42
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
43
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
44
+ IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
45
+ CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
46
+ TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
47
+ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/Rakefile ADDED
@@ -0,0 +1,24 @@
1
+ # -*- ruby -*-
2
+
3
+ require 'rubygems'
4
+ require 'hoe'
5
+
6
+ # Hoe.plugin :compiler
7
+ # Hoe.plugin :gem_prelude_sucks
8
+ # Hoe.plugin :inline
9
+ # Hoe.plugin :racc
10
+ # Hoe.plugin :rubyforge
11
+
12
+ Hoe.spec 'rawler' do
13
+ # HEY! If you fill these out in ~/.hoe_template/Rakefile.erb then
14
+ # you'll never have to touch them again!
15
+ # (delete this comment too, of course)
16
+
17
+ developer('Oscar Del Ben', 'info@oscardelben.com')
18
+
19
+ self.rubyforge_name = 'oscardelben'
20
+
21
+ extra_deps << ['nokogiri']
22
+ end
23
+
24
+ # vim: syntax=ruby
data/bin/rawler ADDED
@@ -0,0 +1,11 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require File.join(File.dirname(__FILE__), '..', '/lib/rawler.rb')
4
+
5
+ domain = ARGV[0]
6
+
7
+ if domain.nil?
8
+ puts "Usage: rawler http://example.com"
9
+ end
10
+
11
+ Rawler::Base.new(domain, $stdout).validate
data/lib/rawler.rb ADDED
@@ -0,0 +1,13 @@
1
+ require 'rubygems'
2
+ require 'net/http'
3
+ require 'nokogiri'
4
+
5
+ $:.unshift(File.dirname(__FILE__)) unless
6
+ $:.include?(File.dirname(__FILE__)) || $:.include?(File.expand_path(File.dirname(__FILE__)))
7
+
8
+ module Rawler
9
+ VERSION = '0.0.1'
10
+
11
+ autoload :Base, "rawler/base"
12
+ autoload :Crawler, "rawler/crawler"
13
+ end
@@ -0,0 +1,61 @@
1
+ module Rawler
2
+
3
+ class Base
4
+
5
+ attr_accessor :url, :responses
6
+
7
+ def initialize(url, output)
8
+ @url = url
9
+ @responses = {}
10
+ $output = output
11
+ end
12
+
13
+ def validate
14
+ validate_links_in_page(url)
15
+ end
16
+
17
+ private
18
+
19
+ def validate_links_in_page(current_url)
20
+ Rawler::Crawler.new(current_url).links.each do |page_url|
21
+ validate_page(page_url)
22
+ end
23
+ end
24
+
25
+ def validate_page(page_url)
26
+ if not_yet_parsed?(page_url)
27
+ add_status_code(page_url)
28
+ validate_links_in_page(page_url) if same_domain?(page_url)
29
+ end
30
+ end
31
+
32
+ def add_status_code(link)
33
+ uri = URI.parse(link)
34
+
35
+ response = nil
36
+
37
+ Net::HTTP.start(uri.host, uri.port) do |http|
38
+ path = (uri.path.size == 0) ? "/" : uri.path
39
+ response = http.head(path, {'User-Agent'=>'Rawler'})
40
+ end
41
+
42
+ $output.puts("#{response.code} - #{link}")
43
+ responses[link] = { :status => response.code.to_i }
44
+ rescue Errno::ECONNREFUSED
45
+ puts "Connection refused - '#{link}'"
46
+ rescue Timeout::Error, Errno::EINVAL, Errno::ECONNRESET, EOFError,
47
+ Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError, Net::ProtocolError
48
+ puts "Connection problems - #{link}"
49
+ end
50
+
51
+ def same_domain?(link)
52
+ URI.parse(url).host == URI.parse(link).host
53
+ end
54
+
55
+ def not_yet_parsed?(link)
56
+ responses[link].nil?
57
+ end
58
+
59
+ end
60
+
61
+ end
@@ -0,0 +1,23 @@
1
+ module Rawler
2
+
3
+ class Crawler
4
+
5
+ attr_accessor :url, :links
6
+
7
+ def initialize(url)
8
+ @url = url
9
+ end
10
+
11
+ def links
12
+ content = Net::HTTP.get(URI.parse(url))
13
+
14
+ doc = Nokogiri::HTML(content)
15
+ doc.css('a').map { |a| a['href'] }
16
+ rescue Errno::ECONNREFUSED
17
+ $output.puts "Couldn't connect to #{url}"
18
+ []
19
+ end
20
+
21
+ end
22
+
23
+ end
data/spec/spec.opts ADDED
@@ -0,0 +1 @@
1
+ --colour
@@ -0,0 +1,10 @@
1
+
2
+ $:.unshift(File.dirname(__FILE__) + '/../lib')
3
+ require 'rawler'
4
+ require 'fakeweb'
5
+
6
+ FakeWeb.allow_net_connect = false
7
+
8
+ def register(uri, content, status=200)
9
+ FakeWeb.register_uri(:any, uri, :body => content, :status => status)
10
+ end
@@ -0,0 +1,93 @@
1
+ require File.dirname(__FILE__) + '/../spec_helper.rb'
2
+
3
+ describe Rawler::Base do
4
+
5
+ let(:output) { double('output').as_null_object }
6
+ let(:rawler) { Rawler::Base.new('http://example.com', output) }
7
+
8
+ before(:each) do
9
+ register('http://example.com', site)
10
+ end
11
+
12
+ describe "validate_links" do
13
+
14
+ it "should validate links recursively" do
15
+ register('http://example.com/foo1', '<a href="http://external.com/foo">x</a>')
16
+ register('http://example.com/foo2', '')
17
+ register('http://external.com', '')
18
+ register('http://external.com/foo', '')
19
+
20
+ rawler.validate
21
+
22
+ rawler.responses['http://example.com/foo1'].should_not be_nil
23
+ rawler.responses['http://example.com/foo2'].should_not be_nil
24
+ rawler.responses['http://external.com'].should_not be_nil
25
+ rawler.responses['http://external.com/foo'].should_not be_nil
26
+ end
27
+
28
+ it "should not validate links on external pages" do
29
+ register('http://example.com/foo', '<a href="http://external.com/foo">x</a>')
30
+ register('http://external.com/foo', '<a href="http://external.com/bar">x</a>')
31
+ register('http://external.com/bar', '')
32
+
33
+ rawler.validate
34
+
35
+ rawler.responses['http://external.com/foo'].should_not be_nil
36
+ rawler.responses['http://external.com/bar'].should be_nil
37
+ end
38
+
39
+ it "should output results" do
40
+ register('http://example.com/foo1', '<a href="http://external.com/foo">x</a>')
41
+ register('http://example.com/foo2', '')
42
+ register('http://external.com', '')
43
+ register('http://external.com/foo', '', 302)
44
+
45
+ output.should_receive(:puts).with('200 - http://example.com/foo1')
46
+ output.should_receive(:puts).with('200 - http://example.com/foo2')
47
+ output.should_receive(:puts).with('200 - http://external.com')
48
+ output.should_receive(:puts).with('302 - http://external.com/foo')
49
+
50
+ rawler.validate
51
+ end
52
+
53
+ end
54
+
55
+ describe "get_status_code" do
56
+
57
+ it "should add to 200 links" do
58
+ url = 'http://example.com/foo'
59
+ register(url, '', 200)
60
+
61
+ rawler.send(:add_status_code, url)
62
+
63
+ rawler.responses[url][:status].should == 200
64
+ end
65
+
66
+ it "should add to 302 links" do
67
+ url = 'http://example.com/foo'
68
+ register(url, '', 302)
69
+
70
+ rawler.send(:add_status_code, url)
71
+
72
+ rawler.responses[url][:status].should == 302
73
+ end
74
+
75
+ end
76
+
77
+
78
+ private
79
+
80
+ def site
81
+ <<-site
82
+ <html>
83
+ <body>
84
+ <a href="http://example.com/foo1">foo1</a>
85
+ <a href="http://example.com/foo2">foo2</a>
86
+
87
+ <a href="http://external.com">external</a>
88
+ </body>
89
+ </html>
90
+ site
91
+ end
92
+
93
+ end
@@ -0,0 +1,30 @@
1
+ require File.dirname(__FILE__) + '/../spec_helper.rb'
2
+
3
+ describe Rawler::Crawler do
4
+
5
+ it "should parse all links" do
6
+ url = 'http://example.com'
7
+ register(url, site)
8
+
9
+ Rawler::Crawler.new(url).links.should == ['http://example.com/foo', 'http://external.com/bar']
10
+ end
11
+
12
+ private
13
+
14
+ def site
15
+ <<-site
16
+ <!DOCTYPE html>
17
+ <html>
18
+ <body>
19
+ <p>Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.</p>
20
+
21
+ <p><a href="http://example.com/foo">foo</a></p>
22
+
23
+ <p><a href="http://external.com/bar">bar</a></p>
24
+
25
+ </body>
26
+ </html>
27
+ site
28
+ end
29
+
30
+ end
data/tasks/rspec.rake ADDED
@@ -0,0 +1,21 @@
1
+ begin
2
+ require 'spec'
3
+ rescue LoadError
4
+ require 'rubygems' unless ENV['NO_RUBYGEMS']
5
+ require 'spec'
6
+ end
7
+ begin
8
+ require 'spec/rake/spectask'
9
+ rescue LoadError
10
+ puts <<-EOS
11
+ To use rspec for testing you must install rspec gem:
12
+ gem install rspec
13
+ EOS
14
+ exit(0)
15
+ end
16
+
17
+ desc "Run the specs under spec/models"
18
+ Spec::Rake::SpecTask.new do |t|
19
+ t.spec_opts = ['--options', "spec/spec.opts"]
20
+ t.spec_files = FileList['spec/**/*_spec.rb']
21
+ end
metadata ADDED
@@ -0,0 +1,112 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: rawler
3
+ version: !ruby/object:Gem::Version
4
+ hash: 29
5
+ prerelease:
6
+ segments:
7
+ - 0
8
+ - 0
9
+ - 1
10
+ version: 0.0.1
11
+ platform: ruby
12
+ authors:
13
+ - Oscar Del Ben
14
+ autorequire:
15
+ bindir: bin
16
+ cert_chain: []
17
+
18
+ date: 2011-01-10 00:00:00 +01:00
19
+ default_executable:
20
+ dependencies:
21
+ - !ruby/object:Gem::Dependency
22
+ name: nokogiri
23
+ prerelease: false
24
+ requirement: &id001 !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ">="
28
+ - !ruby/object:Gem::Version
29
+ hash: 3
30
+ segments:
31
+ - 0
32
+ version: "0"
33
+ type: :runtime
34
+ version_requirements: *id001
35
+ - !ruby/object:Gem::Dependency
36
+ name: hoe
37
+ prerelease: false
38
+ requirement: &id002 !ruby/object:Gem::Requirement
39
+ none: false
40
+ requirements:
41
+ - - ">="
42
+ - !ruby/object:Gem::Version
43
+ hash: 47
44
+ segments:
45
+ - 2
46
+ - 8
47
+ - 0
48
+ version: 2.8.0
49
+ type: :development
50
+ version_requirements: *id002
51
+ description: Rawler is a Ruby library that crawls your website and checks the status code of each of your links. Useful for finding dead links.
52
+ email:
53
+ - info@oscardelben.com
54
+ executables:
55
+ - rawler
56
+ extensions: []
57
+
58
+ extra_rdoc_files:
59
+ - History.txt
60
+ - Manifest.txt
61
+ - README.txt
62
+ files:
63
+ - History.txt
64
+ - Manifest.txt
65
+ - README.txt
66
+ - Rakefile
67
+ - bin/rawler
68
+ - lib/rawler/base.rb
69
+ - lib/rawler/crawler.rb
70
+ - lib/rawler.rb
71
+ - spec/spec.opts
72
+ - spec/spec_helper.rb
73
+ - spec/unit/base_spec.rb
74
+ - spec/unit/crawler_spec.rb
75
+ - tasks/rspec.rake
76
+ has_rdoc: true
77
+ homepage: http://github.com/oscardelben/rawler
78
+ licenses: []
79
+
80
+ post_install_message:
81
+ rdoc_options:
82
+ - --main
83
+ - README.txt
84
+ require_paths:
85
+ - lib
86
+ required_ruby_version: !ruby/object:Gem::Requirement
87
+ none: false
88
+ requirements:
89
+ - - ">="
90
+ - !ruby/object:Gem::Version
91
+ hash: 3
92
+ segments:
93
+ - 0
94
+ version: "0"
95
+ required_rubygems_version: !ruby/object:Gem::Requirement
96
+ none: false
97
+ requirements:
98
+ - - ">="
99
+ - !ruby/object:Gem::Version
100
+ hash: 3
101
+ segments:
102
+ - 0
103
+ version: "0"
104
+ requirements: []
105
+
106
+ rubyforge_project: oscardelben
107
+ rubygems_version: 1.4.1
108
+ signing_key:
109
+ specification_version: 3
110
+ summary: Rawler is a Ruby library that crawls your website and checks the status code of each of your links
111
+ test_files: []
112
+