rawler 0.1.0 → 0.1.1
Sign up to get free protection for your applications and to get access to all the features.
- data/Gemfile +12 -5
- data/Gemfile.lock +14 -22
- data/LICENSE.txt +20 -0
- data/{README.txt → README.rdoc} +6 -35
- data/Rakefile +58 -24
- data/VERSION +1 -0
- data/bin/rawler +1 -1
- data/lib/rawler.rb +52 -2
- data/lib/rawler/base.rb +45 -5
- data/lib/rawler/core_extensions.rb +3 -1
- data/lib/rawler/core_extensions/module.rb +3 -1
- data/lib/rawler/crawler.rb +28 -4
- data/lib/rawler/request.rb +3 -1
- data/spec/lib/rawler_spec.rb +12 -0
- data/test/helper.rb +18 -0
- data/test/test_rawler.rb +7 -0
- metadata +68 -38
- data/Manifest.txt +0 -21
- data/specs.watchr +0 -58
- data/tasks/rspec.rake +0 -21
data/Gemfile
CHANGED
@@ -1,9 +1,16 @@
|
|
1
1
|
source "http://rubygems.org"
|
2
|
+
# Add dependencies required to use your gem here.
|
3
|
+
# Example:
|
4
|
+
# gem "activesupport", ">= 2.3.5"
|
2
5
|
|
3
|
-
gem
|
6
|
+
# Add dependencies to develop your gem here.
|
7
|
+
# Include everything needed to run rake, tests, features, etc.
|
8
|
+
gem 'nokogiri'
|
4
9
|
|
5
|
-
|
6
|
-
|
7
|
-
gem "
|
8
|
-
gem "
|
10
|
+
|
11
|
+
group :development do
|
12
|
+
gem "shoulda", ">= 0"
|
13
|
+
gem "bundler", "~> 1.0.0"
|
14
|
+
gem "jeweler", "~> 1.6.4"
|
15
|
+
gem "rcov", ">= 0"
|
9
16
|
end
|
data/Gemfile.lock
CHANGED
@@ -1,30 +1,22 @@
|
|
1
1
|
GEM
|
2
2
|
remote: http://rubygems.org/
|
3
3
|
specs:
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
rspec-core (~> 2.4.0)
|
14
|
-
rspec-expectations (~> 2.4.0)
|
15
|
-
rspec-mocks (~> 2.4.0)
|
16
|
-
rspec-core (2.4.0)
|
17
|
-
rspec-expectations (2.4.0)
|
18
|
-
diff-lcs (~> 1.1.2)
|
19
|
-
rspec-mocks (2.4.0)
|
20
|
-
rubyforge (2.0.4)
|
21
|
-
json_pure (>= 1.1.7)
|
4
|
+
git (1.2.5)
|
5
|
+
jeweler (1.6.4)
|
6
|
+
bundler (~> 1.0)
|
7
|
+
git (>= 1.2.5)
|
8
|
+
rake
|
9
|
+
nokogiri (1.5.0)
|
10
|
+
rake (0.9.2)
|
11
|
+
rcov (0.9.9)
|
12
|
+
shoulda (2.11.3)
|
22
13
|
|
23
14
|
PLATFORMS
|
24
15
|
ruby
|
25
16
|
|
26
17
|
DEPENDENCIES
|
27
|
-
|
28
|
-
|
29
|
-
nokogiri
|
30
|
-
|
18
|
+
bundler (~> 1.0.0)
|
19
|
+
jeweler (~> 1.6.4)
|
20
|
+
nokogiri
|
21
|
+
rcov
|
22
|
+
shoulda
|
data/LICENSE.txt
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
Copyright (c) 2011 Oscar Del Ben
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
4
|
+
a copy of this software and associated documentation files (the
|
5
|
+
"Software"), to deal in the Software without restriction, including
|
6
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
7
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
8
|
+
permit persons to whom the Software is furnished to do so, subject to
|
9
|
+
the following conditions:
|
10
|
+
|
11
|
+
The above copyright notice and this permission notice shall be
|
12
|
+
included in all copies or substantial portions of the Software.
|
13
|
+
|
14
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
15
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
16
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
17
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
18
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
19
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
20
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/{README.txt → README.rdoc}
RENAMED
@@ -1,14 +1,10 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
* http://github.com/oscardelben/rawler
|
4
|
-
|
5
|
-
== DESCRIPTION:
|
1
|
+
### rawler
|
6
2
|
|
7
3
|
Rawler is a Ruby library that crawls your website and checks the status code for each of your links. Useful for finding dead links.
|
8
4
|
|
9
5
|
Rawler will only parse pages with content type 'text/html', but it will check for the response code of every link.
|
10
6
|
|
11
|
-
|
7
|
+
### SYNOPSIS:
|
12
8
|
|
13
9
|
rawler http://example.com [options]
|
14
10
|
|
@@ -19,40 +15,15 @@ Rawler will only parse pages with content type 'text/html', but it will check fo
|
|
19
15
|
--version, -v: Print version and exit
|
20
16
|
--help, -h: Show this message
|
21
17
|
|
22
|
-
|
18
|
+
### INSTALL:
|
23
19
|
|
24
20
|
gem install rawler
|
25
21
|
|
26
|
-
|
27
|
-
|
28
|
-
Run bundle install to install everything you need
|
29
|
-
|
30
|
-
rake test
|
31
|
-
|
32
|
-
To package and run the gem locally:
|
33
|
-
|
34
|
-
rake package
|
35
|
-
cd pkg
|
36
|
-
gem install rawler-#{version}.gem
|
37
|
-
|
38
|
-
If you add files, run:
|
39
|
-
|
40
|
-
rake check_manifest
|
41
|
-
|
42
|
-
And add them to the Manifest file.
|
43
|
-
|
44
|
-
== CONTRIBUTORS:
|
45
|
-
|
46
|
-
* bcoob
|
47
|
-
* Hugh Sasse
|
48
|
-
* Ken Egozi
|
49
|
-
* Robert Glaser
|
50
|
-
* Stefan Schüßler
|
51
|
-
* Vesa Vänskä
|
22
|
+
### CONTRIBUTORS:
|
52
23
|
|
53
|
-
See
|
24
|
+
Many. See [https://github.com/oscardelben/rawler/contributors](https://github.com/oscardelben/rawler/contributors)
|
54
25
|
|
55
|
-
|
26
|
+
### LICENSE:
|
56
27
|
|
57
28
|
(The MIT License)
|
58
29
|
|
data/Rakefile
CHANGED
@@ -1,35 +1,69 @@
|
|
1
|
-
#
|
1
|
+
# encoding: utf-8
|
2
2
|
|
3
3
|
require 'rubygems'
|
4
|
-
require '
|
4
|
+
require 'bundler'
|
5
|
+
require 'fileutils'
|
5
6
|
|
6
|
-
|
7
|
-
|
7
|
+
begin
|
8
|
+
Bundler.setup(:default, :development)
|
9
|
+
rescue Bundler::BundlerError => e
|
10
|
+
$stderr.puts e.message
|
11
|
+
$stderr.puts "Run `bundle install` to install missing gems"
|
12
|
+
exit e.status_code
|
13
|
+
end
|
14
|
+
require 'rake'
|
8
15
|
|
9
|
-
require '
|
10
|
-
|
16
|
+
require 'jeweler'
|
17
|
+
Jeweler::Tasks.new do |gem|
|
18
|
+
# gem is a Gem::Specification... see http://docs.rubygems.org/read/chapter/20 for more options
|
19
|
+
gem.name = "rawler"
|
20
|
+
gem.homepage = "http://github.com/oscardelben/rawler"
|
21
|
+
gem.license = "MIT"
|
22
|
+
gem.summary = %Q{Rawler is a tool that crawls the links of your website}
|
23
|
+
gem.description = %Q{Rawler is a tool that crawls the links of your website}
|
24
|
+
gem.email = "info@oscardelben.com"
|
25
|
+
gem.authors = ["Oscar Del Ben"]
|
26
|
+
gem.executables = ['rawler']
|
27
|
+
# dependencies defined in Gemfile
|
28
|
+
end
|
29
|
+
Jeweler::RubygemsDotOrgTasks.new
|
11
30
|
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
31
|
+
require 'rake/testtask'
|
32
|
+
Rake::TestTask.new(:test) do |test|
|
33
|
+
test.libs << 'lib' << 'test'
|
34
|
+
test.pattern = 'test/**/test_*.rb'
|
35
|
+
test.verbose = true
|
36
|
+
end
|
17
37
|
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
38
|
+
require 'rcov/rcovtask'
|
39
|
+
Rcov::RcovTask.new do |test|
|
40
|
+
test.libs << 'test'
|
41
|
+
test.pattern = 'test/**/test_*.rb'
|
42
|
+
test.verbose = true
|
43
|
+
test.rcov_opts << '--exclude "gems/*"'
|
44
|
+
end
|
22
45
|
|
23
|
-
|
46
|
+
task :default => :test
|
24
47
|
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
end
|
48
|
+
require 'rake/rdoctask'
|
49
|
+
Rake::RDocTask.new do |rdoc|
|
50
|
+
version = File.exist?('VERSION') ? File.read('VERSION') : ""
|
29
51
|
|
30
|
-
|
31
|
-
|
32
|
-
|
52
|
+
rdoc.rdoc_dir = 'rdoc'
|
53
|
+
rdoc.title = "rawler #{version}"
|
54
|
+
rdoc.rdoc_files.include('README*')
|
55
|
+
rdoc.rdoc_files.include('lib/**/*.rb')
|
33
56
|
end
|
34
57
|
|
35
|
-
|
58
|
+
desc 'generate docs'
|
59
|
+
task :rocco do
|
60
|
+
#%x!rm -r html/*!
|
61
|
+
|
62
|
+
Dir.chdir "lib"
|
63
|
+
|
64
|
+
files = Dir['**/*.*']
|
65
|
+
|
66
|
+
files.each do |file|
|
67
|
+
%x!rocco #{file} -o ../html!
|
68
|
+
end
|
69
|
+
end
|
data/VERSION
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
0.1.1
|
data/bin/rawler
CHANGED
@@ -4,7 +4,7 @@ require 'rawler'
|
|
4
4
|
require File.join(File.dirname(__FILE__), '..', '/vendor/lib-trollop.rb')
|
5
5
|
|
6
6
|
opts = Trollop::options do
|
7
|
-
version "rawler
|
7
|
+
version "rawler #{Rawler::VERSION} (c) 2011 Oscar Del Ben"
|
8
8
|
banner <<-EOS
|
9
9
|
Rawler is a command line utility for parsing links on a website
|
10
10
|
|
data/lib/rawler.rb
CHANGED
@@ -1,24 +1,74 @@
|
|
1
|
+
# **Rawler** is a command line tool for finding broken links on your website.
|
2
|
+
# You can install Rawler by running:
|
3
|
+
#
|
4
|
+
# gem install rawler
|
5
|
+
#
|
6
|
+
# To use Rawler type:
|
7
|
+
#
|
8
|
+
# rawler example.com
|
9
|
+
#
|
10
|
+
# Type `rawler -h` to see all the available options (including basic auth support).
|
11
|
+
|
12
|
+
#### Prerequisites
|
13
|
+
|
14
|
+
|
1
15
|
require 'rubygems'
|
2
|
-
|
16
|
+
|
17
|
+
# We use [net/https](http://www.ruby-doc.org/stdlib/libdoc/net/http/rdoc/index.html) for making requests.
|
18
|
+
|
3
19
|
require 'net/https'
|
20
|
+
|
21
|
+
# We use [nokogiri](http://nokogiri.org/) for parsing web pages.
|
22
|
+
|
4
23
|
require 'nokogiri'
|
24
|
+
|
25
|
+
# We use the [logger](http://www.ruby-doc.org/stdlib/libdoc/logger/rdoc/) utility for handling the output.
|
26
|
+
|
5
27
|
require 'logger'
|
6
28
|
|
29
|
+
# We require [rawler/core_extensions](rawler/core_extensions.html) which includes some core extensions we need.
|
30
|
+
|
7
31
|
require 'rawler/core_extensions'
|
8
32
|
|
33
|
+
#### The Rawler module
|
34
|
+
|
35
|
+
# The Rawler module itself is very simple, and it's only used for storing configuration data like the url that we want to fetch, basic username and password.
|
36
|
+
|
9
37
|
module Rawler
|
10
|
-
VERSION = '0.1.
|
38
|
+
VERSION = '0.1.1'
|
11
39
|
|
40
|
+
# `output` is where we want to direct output. It's set to `$stdout` by default.
|
41
|
+
|
12
42
|
mattr_accessor :output
|
43
|
+
|
44
|
+
# `url` is the url that we want to fetch. We need to keep track of it when parsing other pages to see if they are of the same domain.
|
45
|
+
|
13
46
|
mattr_accessor :url
|
47
|
+
|
48
|
+
# The time we wait between requests, default 3. We don't want to send too many requests to your website!
|
49
|
+
|
14
50
|
mattr_accessor :wait
|
15
51
|
|
52
|
+
# Username and Password for basic auth, if needed.
|
53
|
+
|
16
54
|
mattr_accessor :username, :password
|
17
55
|
|
56
|
+
# Here we autoload the specific namespaces when they are needed.
|
57
|
+
|
58
|
+
# [Rawler::Base](rawler/base.html) is responsible for validating all the pages in a domain. It's where all the magic happens.
|
59
|
+
|
18
60
|
autoload :Base, "rawler/base"
|
61
|
+
|
62
|
+
# [Rawler::Crawler](rawler/crawler.html) is responsible for parsing links inside a page.
|
63
|
+
|
19
64
|
autoload :Crawler, "rawler/crawler"
|
65
|
+
|
66
|
+
# [Rawler::Request](rawler/request.html) contains some helper methods for performing requests.
|
67
|
+
|
20
68
|
autoload :Request, "rawler/request"
|
21
69
|
|
70
|
+
# We overwrite url= to automatically add `http://` if needed so that you can simply type `rawler example.com` in the command line.
|
71
|
+
|
22
72
|
def self.url=(url)
|
23
73
|
url.strip!
|
24
74
|
|
data/lib/rawler/base.rb
CHANGED
@@ -1,32 +1,50 @@
|
|
1
|
+
#### Rawler workflow
|
2
|
+
|
3
|
+
# `Rawler::Base` is where all the heavy work is done.
|
4
|
+
# When you call `rawler somesite.com`, we create an instance of Rawler::Base and then call `validate`, which recursively validates all the links relative to the domain that we specified.
|
5
|
+
|
1
6
|
module Rawler
|
2
7
|
|
3
8
|
class Base
|
4
9
|
|
10
|
+
# `responses` is used to keep track of which links we have already parsed, so that we won't parse them again and again.
|
11
|
+
# TODO: rename `responses` to something more meaningful.
|
12
|
+
|
5
13
|
attr_accessor :responses
|
6
14
|
|
15
|
+
# When we instantiate `Rawler::Base` we set some options according to what you specified on the command line.
|
16
|
+
|
7
17
|
def initialize(url, output, options={})
|
8
18
|
@responses = {}
|
9
19
|
|
10
20
|
Rawler.url = URI.escape(url)
|
21
|
+
output.sync = true
|
11
22
|
Rawler.output = Logger.new(output)
|
12
23
|
Rawler.username = options[:username]
|
13
24
|
Rawler.password = options[:password]
|
14
25
|
Rawler.wait = options[:wait].to_i
|
15
26
|
end
|
16
27
|
|
28
|
+
# The method used to start the real validation process
|
29
|
+
|
17
30
|
def validate
|
18
31
|
validate_links_in_page(Rawler.url)
|
19
32
|
end
|
20
33
|
|
21
34
|
private
|
22
35
|
|
23
|
-
|
24
|
-
|
25
|
-
|
36
|
+
# We ask [Rawler::Crawler](crawler.html) for all the links in page and then validate each of them individually.
|
37
|
+
# We then sleep for the value of `Rawler.wait` (default 3) between each request to avoid DoS-ing your server.
|
38
|
+
|
39
|
+
def validate_links_in_page(page)
|
40
|
+
Rawler::Crawler.new(page).links.each do |page_url|
|
41
|
+
validate_page(page_url, page)
|
26
42
|
sleep(Rawler.wait)
|
27
43
|
end
|
28
44
|
end
|
29
45
|
|
46
|
+
# If we haven't validated the page yet, we check its status code and then validate all the links in the page if it's in the same domain
|
47
|
+
|
30
48
|
def validate_page(page_url, from_url)
|
31
49
|
if not_yet_parsed?(page_url)
|
32
50
|
add_status_code(page_url, from_url)
|
@@ -34,20 +52,33 @@ module Rawler
|
|
34
52
|
end
|
35
53
|
end
|
36
54
|
|
55
|
+
# This is where we check the specific page status.
|
56
|
+
|
37
57
|
def add_status_code(link, from_url)
|
38
58
|
response = Rawler::Request.get(link)
|
39
59
|
|
60
|
+
# We follow a redirect if necessary.
|
61
|
+
|
40
62
|
validate_page(response['Location'], from_url) if response['Location']
|
41
63
|
|
64
|
+
# We inform the user about what we got.
|
65
|
+
|
42
66
|
record_response(response.code, link, from_url, response['Location'])
|
67
|
+
|
68
|
+
# We add the current page to `responses` to avoid parsing it again.
|
69
|
+
|
43
70
|
responses[link] = { :status => response.code.to_i }
|
44
71
|
rescue Errno::ECONNREFUSED
|
45
72
|
error("Connection refused - #{link} - Called from: #{from_url}")
|
46
73
|
rescue Timeout::Error, Errno::EINVAL, Errno::ECONNRESET, Errno::ETIMEDOUT,
|
47
74
|
EOFError, Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError, Net::ProtocolError, SocketError
|
48
75
|
error("Connection problems - #{link} - Called from: #{from_url}")
|
76
|
+
rescue Exception
|
77
|
+
error("Unknown error - #{link} - Called from: #{from_url}")
|
49
78
|
end
|
50
79
|
|
80
|
+
# Some helper methods
|
81
|
+
|
51
82
|
def same_domain?(link)
|
52
83
|
URI.parse(Rawler.url).host == URI.parse(link).host
|
53
84
|
end
|
@@ -59,21 +90,30 @@ module Rawler
|
|
59
90
|
def error(message)
|
60
91
|
Rawler.output.error(message)
|
61
92
|
end
|
93
|
+
|
94
|
+
# We use this method to inform the user of a page status
|
62
95
|
|
63
96
|
def record_response(code, link, from_url, redirection=nil)
|
97
|
+
|
98
|
+
# By default, we just give the status code and the page url
|
99
|
+
|
64
100
|
message = "#{code} - #{link}"
|
65
101
|
|
102
|
+
# If the status code is greater than or equal to 300, we also add which url linked to the current page
|
103
|
+
|
66
104
|
if code.to_i >= 300
|
67
105
|
message += " - Called from: #{from_url}"
|
68
106
|
end
|
107
|
+
|
108
|
+
# We add information about redirects, if a redirect was set
|
69
109
|
|
70
110
|
message += " - Following redirection to: #{redirection}" if redirection
|
71
111
|
|
112
|
+
# Depending on the status code, we use a different method of logger.
|
113
|
+
|
72
114
|
code = code.to_i
|
73
115
|
case code / 100
|
74
116
|
when 1
|
75
|
-
# TODO: check that if a 100 is received
|
76
|
-
# then there is another status code as well
|
77
117
|
Rawler.output.info(message)
|
78
118
|
when 2 then
|
79
119
|
Rawler.output.info(message)
|
data/lib/rawler/crawler.rb
CHANGED
@@ -1,39 +1,62 @@
|
|
1
|
+
# `Rawler::Crawler` is responsible for parsing links inside a page
|
2
|
+
|
1
3
|
module Rawler
|
2
4
|
|
3
5
|
class Crawler
|
4
6
|
|
5
|
-
|
7
|
+
# An instance of Rawler::Crawler has a url which represents the url for which we want to parse links.
|
8
|
+
|
9
|
+
attr_accessor :url
|
10
|
+
|
11
|
+
# We want to skip some kind of formats
|
6
12
|
|
7
13
|
SKIP_FORMATS = /^(javascript|mailto)/
|
14
|
+
|
15
|
+
# To use this class, just pass it a url
|
8
16
|
|
9
17
|
def initialize(url)
|
10
18
|
@url = url.strip
|
11
19
|
end
|
20
|
+
|
21
|
+
# And then call `links` to get its links.
|
12
22
|
|
13
23
|
def links
|
24
|
+
# If the url is on a different domain than the main Rawler.url, or if the page is not html, we return an empty array
|
14
25
|
if different_domain?(url, Rawler.url) || not_html?(url)
|
15
26
|
return []
|
16
27
|
end
|
17
28
|
|
29
|
+
# Otherwise we fetch the page
|
30
|
+
|
18
31
|
response = Rawler::Request.get(url)
|
32
|
+
|
33
|
+
# And kindly ask nokogiri to convert it for us
|
19
34
|
|
20
35
|
doc = Nokogiri::HTML(response.body)
|
36
|
+
|
37
|
+
# We then do some magic, search all the links in the document that contain a valid link, and return them.
|
21
38
|
doc.css('a').map { |a| a['href'] }.select { |url| !url.nil? }.map { |url| absolute_url(url) }.select { |url| valid_url?(url) }
|
22
|
-
rescue Errno::ECONNREFUSED
|
39
|
+
rescue Errno::ECONNREFUSED
|
23
40
|
write("Couldn't connect to #{url}")
|
24
41
|
[]
|
25
|
-
rescue Errno::ETIMEDOUT
|
42
|
+
rescue Errno::ETIMEDOUT
|
26
43
|
write("Connection to #{url} timed out")
|
27
44
|
[]
|
28
45
|
end
|
29
46
|
|
30
47
|
private
|
31
48
|
|
49
|
+
# Here's how we transform a relative url to an absolute url
|
50
|
+
|
32
51
|
def absolute_url(path)
|
52
|
+
# First, encode the url
|
33
53
|
path = URI.encode(path.strip, Regexp.new("[^#{URI::PATTERN::UNRESERVED}#{URI::PATTERN::RESERVED}#]"))
|
54
|
+
|
55
|
+
# if the url contains a scheme that means it's already absolute
|
34
56
|
if URI.parse(path).scheme
|
35
57
|
path
|
36
58
|
else
|
59
|
+
# Otherwise we merge `url` to get the absolute url
|
37
60
|
URI.parse(url).merge(path).to_s
|
38
61
|
end
|
39
62
|
rescue URI::InvalidURIError
|
@@ -41,7 +64,8 @@ module Rawler
|
|
41
64
|
nil
|
42
65
|
end
|
43
66
|
|
44
|
-
#
|
67
|
+
# Some helper methods
|
68
|
+
|
45
69
|
def write(message)
|
46
70
|
Rawler.output.error(message)
|
47
71
|
end
|
data/lib/rawler/request.rb
CHANGED
data/spec/lib/rawler_spec.rb
CHANGED
@@ -135,6 +135,18 @@ describe Rawler::Base do
|
|
135
135
|
rawler.send(:add_status_code, url, from)
|
136
136
|
end
|
137
137
|
end
|
138
|
+
|
139
|
+
it "should rescue from general errors" do
|
140
|
+
url = 'http://example.com'
|
141
|
+
from = 'http://other.com'
|
142
|
+
|
143
|
+
Rawler::Request.should_receive(:get).and_raise
|
144
|
+
|
145
|
+
output.should_receive(:error).with("Unknown error - #{url} - Called from: #{from}")
|
146
|
+
|
147
|
+
rawler.send(:add_status_code, url, from)
|
148
|
+
end
|
149
|
+
|
138
150
|
|
139
151
|
end
|
140
152
|
|
data/test/helper.rb
ADDED
@@ -0,0 +1,18 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'bundler'
|
3
|
+
begin
|
4
|
+
Bundler.setup(:default, :development)
|
5
|
+
rescue Bundler::BundlerError => e
|
6
|
+
$stderr.puts e.message
|
7
|
+
$stderr.puts "Run `bundle install` to install missing gems"
|
8
|
+
exit e.status_code
|
9
|
+
end
|
10
|
+
require 'test/unit'
|
11
|
+
require 'shoulda'
|
12
|
+
|
13
|
+
$LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
|
14
|
+
$LOAD_PATH.unshift(File.dirname(__FILE__))
|
15
|
+
require 'rawler'
|
16
|
+
|
17
|
+
class Test::Unit::TestCase
|
18
|
+
end
|
data/test/test_rawler.rb
ADDED
metadata
CHANGED
@@ -1,12 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rawler
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
|
4
|
+
hash: 25
|
5
|
+
prerelease:
|
5
6
|
segments:
|
6
7
|
- 0
|
7
8
|
- 1
|
8
|
-
-
|
9
|
-
version: 0.1.
|
9
|
+
- 1
|
10
|
+
version: 0.1.1
|
10
11
|
platform: ruby
|
11
12
|
authors:
|
12
13
|
- Oscar Del Ben
|
@@ -14,71 +15,99 @@ autorequire:
|
|
14
15
|
bindir: bin
|
15
16
|
cert_chain: []
|
16
17
|
|
17
|
-
date: 2011-
|
18
|
-
default_executable:
|
18
|
+
date: 2011-09-16 00:00:00 +02:00
|
19
|
+
default_executable: rawler
|
19
20
|
dependencies:
|
20
21
|
- !ruby/object:Gem::Dependency
|
21
|
-
name: nokogiri
|
22
22
|
prerelease: false
|
23
|
+
type: :runtime
|
23
24
|
requirement: &id001 !ruby/object:Gem::Requirement
|
24
25
|
none: false
|
25
26
|
requirements:
|
26
27
|
- - ">="
|
27
28
|
- !ruby/object:Gem::Version
|
29
|
+
hash: 3
|
28
30
|
segments:
|
29
31
|
- 0
|
30
32
|
version: "0"
|
31
|
-
|
33
|
+
name: nokogiri
|
32
34
|
version_requirements: *id001
|
33
35
|
- !ruby/object:Gem::Dependency
|
34
|
-
name: rubyforge
|
35
36
|
prerelease: false
|
37
|
+
type: :development
|
36
38
|
requirement: &id002 !ruby/object:Gem::Requirement
|
37
39
|
none: false
|
38
40
|
requirements:
|
39
41
|
- - ">="
|
40
42
|
- !ruby/object:Gem::Version
|
43
|
+
hash: 3
|
41
44
|
segments:
|
42
|
-
- 2
|
43
45
|
- 0
|
44
|
-
|
45
|
-
|
46
|
-
type: :development
|
46
|
+
version: "0"
|
47
|
+
name: shoulda
|
47
48
|
version_requirements: *id002
|
48
49
|
- !ruby/object:Gem::Dependency
|
49
|
-
name: hoe
|
50
50
|
prerelease: false
|
51
|
+
type: :development
|
51
52
|
requirement: &id003 !ruby/object:Gem::Requirement
|
52
53
|
none: false
|
53
54
|
requirements:
|
54
|
-
- -
|
55
|
+
- - ~>
|
56
|
+
- !ruby/object:Gem::Version
|
57
|
+
hash: 23
|
58
|
+
segments:
|
59
|
+
- 1
|
60
|
+
- 0
|
61
|
+
- 0
|
62
|
+
version: 1.0.0
|
63
|
+
name: bundler
|
64
|
+
version_requirements: *id003
|
65
|
+
- !ruby/object:Gem::Dependency
|
66
|
+
prerelease: false
|
67
|
+
type: :development
|
68
|
+
requirement: &id004 !ruby/object:Gem::Requirement
|
69
|
+
none: false
|
70
|
+
requirements:
|
71
|
+
- - ~>
|
55
72
|
- !ruby/object:Gem::Version
|
73
|
+
hash: 7
|
56
74
|
segments:
|
57
|
-
-
|
75
|
+
- 1
|
58
76
|
- 6
|
59
|
-
-
|
60
|
-
version:
|
77
|
+
- 4
|
78
|
+
version: 1.6.4
|
79
|
+
name: jeweler
|
80
|
+
version_requirements: *id004
|
81
|
+
- !ruby/object:Gem::Dependency
|
82
|
+
prerelease: false
|
61
83
|
type: :development
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
84
|
+
requirement: &id005 !ruby/object:Gem::Requirement
|
85
|
+
none: false
|
86
|
+
requirements:
|
87
|
+
- - ">="
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
hash: 3
|
90
|
+
segments:
|
91
|
+
- 0
|
92
|
+
version: "0"
|
93
|
+
name: rcov
|
94
|
+
version_requirements: *id005
|
95
|
+
description: Rawler is a tool that crawls the links of your website
|
96
|
+
email: info@oscardelben.com
|
69
97
|
executables:
|
70
98
|
- rawler
|
71
99
|
extensions: []
|
72
100
|
|
73
101
|
extra_rdoc_files:
|
74
|
-
-
|
75
|
-
- README.
|
102
|
+
- LICENSE.txt
|
103
|
+
- README.rdoc
|
76
104
|
files:
|
77
105
|
- Gemfile
|
78
106
|
- Gemfile.lock
|
79
|
-
-
|
80
|
-
- README.
|
107
|
+
- LICENSE.txt
|
108
|
+
- README.rdoc
|
81
109
|
- Rakefile
|
110
|
+
- VERSION
|
82
111
|
- bin/rawler
|
83
112
|
- lib/rawler.rb
|
84
113
|
- lib/rawler/base.rb
|
@@ -92,17 +121,16 @@ files:
|
|
92
121
|
- spec/lib/rawler_spec.rb
|
93
122
|
- spec/spec.opts
|
94
123
|
- spec/spec_helper.rb
|
95
|
-
-
|
96
|
-
-
|
124
|
+
- test/helper.rb
|
125
|
+
- test/test_rawler.rb
|
97
126
|
- vendor/lib-trollop.rb
|
98
127
|
has_rdoc: true
|
99
128
|
homepage: http://github.com/oscardelben/rawler
|
100
|
-
licenses:
|
101
|
-
|
129
|
+
licenses:
|
130
|
+
- MIT
|
102
131
|
post_install_message:
|
103
|
-
rdoc_options:
|
104
|
-
|
105
|
-
- README.txt
|
132
|
+
rdoc_options: []
|
133
|
+
|
106
134
|
require_paths:
|
107
135
|
- lib
|
108
136
|
required_ruby_version: !ruby/object:Gem::Requirement
|
@@ -110,6 +138,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
110
138
|
requirements:
|
111
139
|
- - ">="
|
112
140
|
- !ruby/object:Gem::Version
|
141
|
+
hash: 3
|
113
142
|
segments:
|
114
143
|
- 0
|
115
144
|
version: "0"
|
@@ -118,15 +147,16 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
118
147
|
requirements:
|
119
148
|
- - ">="
|
120
149
|
- !ruby/object:Gem::Version
|
150
|
+
hash: 3
|
121
151
|
segments:
|
122
152
|
- 0
|
123
153
|
version: "0"
|
124
154
|
requirements: []
|
125
155
|
|
126
|
-
rubyforge_project:
|
127
|
-
rubygems_version: 1.
|
156
|
+
rubyforge_project:
|
157
|
+
rubygems_version: 1.6.2
|
128
158
|
signing_key:
|
129
159
|
specification_version: 3
|
130
|
-
summary: Rawler is a
|
160
|
+
summary: Rawler is a tool that crawls the links of your website
|
131
161
|
test_files: []
|
132
162
|
|
data/Manifest.txt
DELETED
@@ -1,21 +0,0 @@
|
|
1
|
-
Gemfile
|
2
|
-
Gemfile.lock
|
3
|
-
Manifest.txt
|
4
|
-
README.txt
|
5
|
-
Rakefile
|
6
|
-
bin/rawler
|
7
|
-
lib/rawler.rb
|
8
|
-
lib/rawler/base.rb
|
9
|
-
lib/rawler/core_extensions.rb
|
10
|
-
lib/rawler/core_extensions/module.rb
|
11
|
-
lib/rawler/crawler.rb
|
12
|
-
lib/rawler/request.rb
|
13
|
-
spec/lib/base_spec.rb
|
14
|
-
spec/lib/rawler/base_spec.rb
|
15
|
-
spec/lib/rawler/crawler_spec.rb
|
16
|
-
spec/lib/rawler_spec.rb
|
17
|
-
spec/spec.opts
|
18
|
-
spec/spec_helper.rb
|
19
|
-
specs.watchr
|
20
|
-
tasks/rspec.rake
|
21
|
-
vendor/lib-trollop.rb
|
data/specs.watchr
DELETED
@@ -1,58 +0,0 @@
|
|
1
|
-
# Run me with:
|
2
|
-
#
|
3
|
-
# $ watchr specs.watchr
|
4
|
-
|
5
|
-
# --------------------------------------------------
|
6
|
-
# Convenience Methods
|
7
|
-
# --------------------------------------------------
|
8
|
-
def all_test_files
|
9
|
-
Dir['spec/**/*_spec.rb']
|
10
|
-
end
|
11
|
-
|
12
|
-
def run_test_matching(thing_to_match)
|
13
|
-
matches = all_test_files.grep(/#{thing_to_match}/i)
|
14
|
-
if matches.empty?
|
15
|
-
puts "Sorry, thanks for playing, but there were no matches for #{thing_to_match}"
|
16
|
-
else
|
17
|
-
run matches.join(' ')
|
18
|
-
end
|
19
|
-
end
|
20
|
-
|
21
|
-
def run(files_to_run)
|
22
|
-
puts("Running: #{files_to_run}")
|
23
|
-
system("clear;rspec -cfs #{files_to_run}")
|
24
|
-
no_int_for_you
|
25
|
-
end
|
26
|
-
|
27
|
-
def run_all_tests
|
28
|
-
run(all_test_files.join(' '))
|
29
|
-
end
|
30
|
-
|
31
|
-
# --------------------------------------------------
|
32
|
-
# Watchr Rules
|
33
|
-
# --------------------------------------------------
|
34
|
-
watch('^spec/(.*)_spec\.rb' ) { |m| run_test_matching(m[1]) }
|
35
|
-
watch('^lib/(.*)\.rb' ) { |m| run_test_matching(m[1]) }
|
36
|
-
watch('^spec/spec_helper\.rb') { run_all_tests }
|
37
|
-
# --------------------------------------------------
|
38
|
-
# Signal Handling
|
39
|
-
# --------------------------------------------------
|
40
|
-
|
41
|
-
def no_int_for_you
|
42
|
-
@sent_an_int = nil
|
43
|
-
end
|
44
|
-
|
45
|
-
Signal.trap 'INT' do
|
46
|
-
if @sent_an_int then
|
47
|
-
puts " A second INT? Ok, I get the message. Shutting down now."
|
48
|
-
exit
|
49
|
-
else
|
50
|
-
puts " Did you just send me an INT? Ugh. I'll quit for real if you do it again."
|
51
|
-
@sent_an_int = true
|
52
|
-
Kernel.sleep 1.5
|
53
|
-
run_all_tests
|
54
|
-
end
|
55
|
-
end
|
56
|
-
|
57
|
-
# vim:ft=ruby
|
58
|
-
|
data/tasks/rspec.rake
DELETED
@@ -1,21 +0,0 @@
|
|
1
|
-
begin
|
2
|
-
require 'spec'
|
3
|
-
rescue LoadError
|
4
|
-
require 'rubygems' unless ENV['NO_RUBYGEMS']
|
5
|
-
require 'spec'
|
6
|
-
end
|
7
|
-
begin
|
8
|
-
require 'spec/rake/spectask'
|
9
|
-
rescue LoadError
|
10
|
-
puts <<-EOS
|
11
|
-
To use rspec for testing you must install rspec gem:
|
12
|
-
gem install rspec
|
13
|
-
EOS
|
14
|
-
exit(0)
|
15
|
-
end
|
16
|
-
|
17
|
-
desc "Run the specs under spec/models"
|
18
|
-
Spec::Rake::SpecTask.new do |t|
|
19
|
-
t.spec_opts = ['--options', "spec/spec.opts"]
|
20
|
-
t.spec_files = FileList['spec/**/*_spec.rb']
|
21
|
-
end
|