rawler 0.1.0 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Gemfile +12 -5
- data/Gemfile.lock +14 -22
- data/LICENSE.txt +20 -0
- data/{README.txt → README.rdoc} +6 -35
- data/Rakefile +58 -24
- data/VERSION +1 -0
- data/bin/rawler +1 -1
- data/lib/rawler.rb +52 -2
- data/lib/rawler/base.rb +45 -5
- data/lib/rawler/core_extensions.rb +3 -1
- data/lib/rawler/core_extensions/module.rb +3 -1
- data/lib/rawler/crawler.rb +28 -4
- data/lib/rawler/request.rb +3 -1
- data/spec/lib/rawler_spec.rb +12 -0
- data/test/helper.rb +18 -0
- data/test/test_rawler.rb +7 -0
- metadata +68 -38
- data/Manifest.txt +0 -21
- data/specs.watchr +0 -58
- data/tasks/rspec.rake +0 -21
data/Gemfile
CHANGED
@@ -1,9 +1,16 @@
|
|
1
1
|
source "http://rubygems.org"
|
2
|
+
# Add dependencies required to use your gem here.
|
3
|
+
# Example:
|
4
|
+
# gem "activesupport", ">= 2.3.5"
|
2
5
|
|
3
|
-
gem
|
6
|
+
# Add dependencies to develop your gem here.
|
7
|
+
# Include everything needed to run rake, tests, features, etc.
|
8
|
+
gem 'nokogiri'
|
4
9
|
|
5
|
-
|
6
|
-
|
7
|
-
gem "
|
8
|
-
gem "
|
10
|
+
|
11
|
+
group :development do
|
12
|
+
gem "shoulda", ">= 0"
|
13
|
+
gem "bundler", "~> 1.0.0"
|
14
|
+
gem "jeweler", "~> 1.6.4"
|
15
|
+
gem "rcov", ">= 0"
|
9
16
|
end
|
data/Gemfile.lock
CHANGED
@@ -1,30 +1,22 @@
|
|
1
1
|
GEM
|
2
2
|
remote: http://rubygems.org/
|
3
3
|
specs:
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
rspec-core (~> 2.4.0)
|
14
|
-
rspec-expectations (~> 2.4.0)
|
15
|
-
rspec-mocks (~> 2.4.0)
|
16
|
-
rspec-core (2.4.0)
|
17
|
-
rspec-expectations (2.4.0)
|
18
|
-
diff-lcs (~> 1.1.2)
|
19
|
-
rspec-mocks (2.4.0)
|
20
|
-
rubyforge (2.0.4)
|
21
|
-
json_pure (>= 1.1.7)
|
4
|
+
git (1.2.5)
|
5
|
+
jeweler (1.6.4)
|
6
|
+
bundler (~> 1.0)
|
7
|
+
git (>= 1.2.5)
|
8
|
+
rake
|
9
|
+
nokogiri (1.5.0)
|
10
|
+
rake (0.9.2)
|
11
|
+
rcov (0.9.9)
|
12
|
+
shoulda (2.11.3)
|
22
13
|
|
23
14
|
PLATFORMS
|
24
15
|
ruby
|
25
16
|
|
26
17
|
DEPENDENCIES
|
27
|
-
|
28
|
-
|
29
|
-
nokogiri
|
30
|
-
|
18
|
+
bundler (~> 1.0.0)
|
19
|
+
jeweler (~> 1.6.4)
|
20
|
+
nokogiri
|
21
|
+
rcov
|
22
|
+
shoulda
|
data/LICENSE.txt
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
Copyright (c) 2011 Oscar Del Ben
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
4
|
+
a copy of this software and associated documentation files (the
|
5
|
+
"Software"), to deal in the Software without restriction, including
|
6
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
7
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
8
|
+
permit persons to whom the Software is furnished to do so, subject to
|
9
|
+
the following conditions:
|
10
|
+
|
11
|
+
The above copyright notice and this permission notice shall be
|
12
|
+
included in all copies or substantial portions of the Software.
|
13
|
+
|
14
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
15
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
16
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
17
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
18
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
19
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
20
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/{README.txt → README.rdoc}
RENAMED
@@ -1,14 +1,10 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
* http://github.com/oscardelben/rawler
|
4
|
-
|
5
|
-
== DESCRIPTION:
|
1
|
+
### rawler
|
6
2
|
|
7
3
|
Rawler is a Ruby library that crawls your website and checks the status code for each of your links. Useful for finding dead links.
|
8
4
|
|
9
5
|
Rawler will only parse pages with content type 'text/html', but it will check for the response code of every link.
|
10
6
|
|
11
|
-
|
7
|
+
### SYNOPSIS:
|
12
8
|
|
13
9
|
rawler http://example.com [options]
|
14
10
|
|
@@ -19,40 +15,15 @@ Rawler will only parse pages with content type 'text/html', but it will check fo
|
|
19
15
|
--version, -v: Print version and exit
|
20
16
|
--help, -h: Show this message
|
21
17
|
|
22
|
-
|
18
|
+
### INSTALL:
|
23
19
|
|
24
20
|
gem install rawler
|
25
21
|
|
26
|
-
|
27
|
-
|
28
|
-
Run bundle install to install everything you need
|
29
|
-
|
30
|
-
rake test
|
31
|
-
|
32
|
-
To package and run the gem locally:
|
33
|
-
|
34
|
-
rake package
|
35
|
-
cd pkg
|
36
|
-
gem install rawler-#{version}.gem
|
37
|
-
|
38
|
-
If you add files, run:
|
39
|
-
|
40
|
-
rake check_manifest
|
41
|
-
|
42
|
-
And add them to the Manifest file.
|
43
|
-
|
44
|
-
== CONTRIBUTORS:
|
45
|
-
|
46
|
-
* bcoob
|
47
|
-
* Hugh Sasse
|
48
|
-
* Ken Egozi
|
49
|
-
* Robert Glaser
|
50
|
-
* Stefan Schüßler
|
51
|
-
* Vesa Vänskä
|
22
|
+
### CONTRIBUTORS:
|
52
23
|
|
53
|
-
See
|
24
|
+
Many. See [https://github.com/oscardelben/rawler/contributors](https://github.com/oscardelben/rawler/contributors)
|
54
25
|
|
55
|
-
|
26
|
+
### LICENSE:
|
56
27
|
|
57
28
|
(The MIT License)
|
58
29
|
|
data/Rakefile
CHANGED
@@ -1,35 +1,69 @@
|
|
1
|
-
#
|
1
|
+
# encoding: utf-8
|
2
2
|
|
3
3
|
require 'rubygems'
|
4
|
-
require '
|
4
|
+
require 'bundler'
|
5
|
+
require 'fileutils'
|
5
6
|
|
6
|
-
|
7
|
-
|
7
|
+
begin
|
8
|
+
Bundler.setup(:default, :development)
|
9
|
+
rescue Bundler::BundlerError => e
|
10
|
+
$stderr.puts e.message
|
11
|
+
$stderr.puts "Run `bundle install` to install missing gems"
|
12
|
+
exit e.status_code
|
13
|
+
end
|
14
|
+
require 'rake'
|
8
15
|
|
9
|
-
require '
|
10
|
-
|
16
|
+
require 'jeweler'
|
17
|
+
Jeweler::Tasks.new do |gem|
|
18
|
+
# gem is a Gem::Specification... see http://docs.rubygems.org/read/chapter/20 for more options
|
19
|
+
gem.name = "rawler"
|
20
|
+
gem.homepage = "http://github.com/oscardelben/rawler"
|
21
|
+
gem.license = "MIT"
|
22
|
+
gem.summary = %Q{Rawler is a tool that crawls the links of your website}
|
23
|
+
gem.description = %Q{Rawler is a tool that crawls the links of your website}
|
24
|
+
gem.email = "info@oscardelben.com"
|
25
|
+
gem.authors = ["Oscar Del Ben"]
|
26
|
+
gem.executables = ['rawler']
|
27
|
+
# dependencies defined in Gemfile
|
28
|
+
end
|
29
|
+
Jeweler::RubygemsDotOrgTasks.new
|
11
30
|
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
31
|
+
require 'rake/testtask'
|
32
|
+
Rake::TestTask.new(:test) do |test|
|
33
|
+
test.libs << 'lib' << 'test'
|
34
|
+
test.pattern = 'test/**/test_*.rb'
|
35
|
+
test.verbose = true
|
36
|
+
end
|
17
37
|
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
38
|
+
require 'rcov/rcovtask'
|
39
|
+
Rcov::RcovTask.new do |test|
|
40
|
+
test.libs << 'test'
|
41
|
+
test.pattern = 'test/**/test_*.rb'
|
42
|
+
test.verbose = true
|
43
|
+
test.rcov_opts << '--exclude "gems/*"'
|
44
|
+
end
|
22
45
|
|
23
|
-
|
46
|
+
task :default => :test
|
24
47
|
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
end
|
48
|
+
require 'rake/rdoctask'
|
49
|
+
Rake::RDocTask.new do |rdoc|
|
50
|
+
version = File.exist?('VERSION') ? File.read('VERSION') : ""
|
29
51
|
|
30
|
-
|
31
|
-
|
32
|
-
|
52
|
+
rdoc.rdoc_dir = 'rdoc'
|
53
|
+
rdoc.title = "rawler #{version}"
|
54
|
+
rdoc.rdoc_files.include('README*')
|
55
|
+
rdoc.rdoc_files.include('lib/**/*.rb')
|
33
56
|
end
|
34
57
|
|
35
|
-
|
58
|
+
desc 'generate docs'
|
59
|
+
task :rocco do
|
60
|
+
#%x!rm -r html/*!
|
61
|
+
|
62
|
+
Dir.chdir "lib"
|
63
|
+
|
64
|
+
files = Dir['**/*.*']
|
65
|
+
|
66
|
+
files.each do |file|
|
67
|
+
%x!rocco #{file} -o ../html!
|
68
|
+
end
|
69
|
+
end
|
data/VERSION
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
0.1.1
|
data/bin/rawler
CHANGED
@@ -4,7 +4,7 @@ require 'rawler'
|
|
4
4
|
require File.join(File.dirname(__FILE__), '..', '/vendor/lib-trollop.rb')
|
5
5
|
|
6
6
|
opts = Trollop::options do
|
7
|
-
version "rawler
|
7
|
+
version "rawler #{Rawler::VERSION} (c) 2011 Oscar Del Ben"
|
8
8
|
banner <<-EOS
|
9
9
|
Rawler is a command line utility for parsing links on a website
|
10
10
|
|
data/lib/rawler.rb
CHANGED
@@ -1,24 +1,74 @@
|
|
1
|
+
# **Rawler** is a command line tool for finding broken links on your website.
|
2
|
+
# You can install Rawler by running:
|
3
|
+
#
|
4
|
+
# gem install rawler
|
5
|
+
#
|
6
|
+
# To use Rawler type:
|
7
|
+
#
|
8
|
+
# rawler example.com
|
9
|
+
#
|
10
|
+
# Type `rawler -h` to see all the available options (including basic auth support).
|
11
|
+
|
12
|
+
#### Prerequisites
|
13
|
+
|
14
|
+
|
1
15
|
require 'rubygems'
|
2
|
-
|
16
|
+
|
17
|
+
# We use [net/https](http://www.ruby-doc.org/stdlib/libdoc/net/http/rdoc/index.html) for making requests.
|
18
|
+
|
3
19
|
require 'net/https'
|
20
|
+
|
21
|
+
# We use [nokogiri](http://nokogiri.org/) for parsing web pages.
|
22
|
+
|
4
23
|
require 'nokogiri'
|
24
|
+
|
25
|
+
# We use the [logger](http://www.ruby-doc.org/stdlib/libdoc/logger/rdoc/) utility for handling the output.
|
26
|
+
|
5
27
|
require 'logger'
|
6
28
|
|
29
|
+
# We require [rawler/core_extensions](rawler/core_extensions.html) which includes some core extensions we need.
|
30
|
+
|
7
31
|
require 'rawler/core_extensions'
|
8
32
|
|
33
|
+
#### The Rawler module
|
34
|
+
|
35
|
+
# The Rawler module itself is very simple, and it's only used for storing configuration data like the url that we want to fetch, basic username and password.
|
36
|
+
|
9
37
|
module Rawler
|
10
|
-
VERSION = '0.1.
|
38
|
+
VERSION = '0.1.1'
|
11
39
|
|
40
|
+
# `output` is where we want to direct output. It's set to `$stdout` by default.
|
41
|
+
|
12
42
|
mattr_accessor :output
|
43
|
+
|
44
|
+
# `url` is the url that we want to fetch. We need to keep track of it when parsing other pages to see if they are of the same domain.
|
45
|
+
|
13
46
|
mattr_accessor :url
|
47
|
+
|
48
|
+
# The time we wait between requests, default 3. We don't want to send too many requests to your website!
|
49
|
+
|
14
50
|
mattr_accessor :wait
|
15
51
|
|
52
|
+
# Username and Password for basic auth, if needed.
|
53
|
+
|
16
54
|
mattr_accessor :username, :password
|
17
55
|
|
56
|
+
# Here we autoload the specific namespaces when they are needed.
|
57
|
+
|
58
|
+
# [Rawler::Base](rawler/base.html) is responsible for validating all the pages in a domain. It's where all the magic happens.
|
59
|
+
|
18
60
|
autoload :Base, "rawler/base"
|
61
|
+
|
62
|
+
# [Rawler::Crawler](rawler/crawler.html) is responsible for parsing links inside a page.
|
63
|
+
|
19
64
|
autoload :Crawler, "rawler/crawler"
|
65
|
+
|
66
|
+
# [Rawler::Request](rawler/request.html) contains some helper methods for performing requests.
|
67
|
+
|
20
68
|
autoload :Request, "rawler/request"
|
21
69
|
|
70
|
+
# We overwrite url= to automatically add `http://` if needed so that you can simply type `rawler example.com` in the command line.
|
71
|
+
|
22
72
|
def self.url=(url)
|
23
73
|
url.strip!
|
24
74
|
|
data/lib/rawler/base.rb
CHANGED
@@ -1,32 +1,50 @@
|
|
1
|
+
#### Rawler workflow
|
2
|
+
|
3
|
+
# `Rawler::Base` is where all the heavy work is done.
|
4
|
+
# When you call `rawler somesite.com`, we create an instance of Rawler::Base and then call `validate`, which recursively validates all the links relative to the domain that we specified.
|
5
|
+
|
1
6
|
module Rawler
|
2
7
|
|
3
8
|
class Base
|
4
9
|
|
10
|
+
# `responses` is used to keep track of which links we have already parsed, so that we won't parse them again and again.
|
11
|
+
# TODO: rename `responses` to something more meaningful.
|
12
|
+
|
5
13
|
attr_accessor :responses
|
6
14
|
|
15
|
+
# When we instantiate `Rawler::Base` we set some options according to what you specified on the command line.
|
16
|
+
|
7
17
|
def initialize(url, output, options={})
|
8
18
|
@responses = {}
|
9
19
|
|
10
20
|
Rawler.url = URI.escape(url)
|
21
|
+
output.sync = true
|
11
22
|
Rawler.output = Logger.new(output)
|
12
23
|
Rawler.username = options[:username]
|
13
24
|
Rawler.password = options[:password]
|
14
25
|
Rawler.wait = options[:wait].to_i
|
15
26
|
end
|
16
27
|
|
28
|
+
# The method used to start the real validation process
|
29
|
+
|
17
30
|
def validate
|
18
31
|
validate_links_in_page(Rawler.url)
|
19
32
|
end
|
20
33
|
|
21
34
|
private
|
22
35
|
|
23
|
-
|
24
|
-
|
25
|
-
|
36
|
+
# We ask [Rawler::Crawler](crawler.html) for all the links in page and then validate each of them individually.
|
37
|
+
# We then sleep for the value of `Rawler.wait` (default 3) between each request to avoid DoSing your server.
|
38
|
+
|
39
|
+
def validate_links_in_page(page)
|
40
|
+
Rawler::Crawler.new(page).links.each do |page_url|
|
41
|
+
validate_page(page_url, page)
|
26
42
|
sleep(Rawler.wait)
|
27
43
|
end
|
28
44
|
end
|
29
45
|
|
46
|
+
# If we haven't validated the page yet, we check its status code and then validate all the links in the page if it's in the same domain
|
47
|
+
|
30
48
|
def validate_page(page_url, from_url)
|
31
49
|
if not_yet_parsed?(page_url)
|
32
50
|
add_status_code(page_url, from_url)
|
@@ -34,20 +52,33 @@ module Rawler
|
|
34
52
|
end
|
35
53
|
end
|
36
54
|
|
55
|
+
# This is where we check the specific page status.
|
56
|
+
|
37
57
|
def add_status_code(link, from_url)
|
38
58
|
response = Rawler::Request.get(link)
|
39
59
|
|
60
|
+
# We follow a redirect if necessary.
|
61
|
+
|
40
62
|
validate_page(response['Location'], from_url) if response['Location']
|
41
63
|
|
64
|
+
# We inform the user about what we got.
|
65
|
+
|
42
66
|
record_response(response.code, link, from_url, response['Location'])
|
67
|
+
|
68
|
+
# We add the current page to `responses` to avoid parsing it again.
|
69
|
+
|
43
70
|
responses[link] = { :status => response.code.to_i }
|
44
71
|
rescue Errno::ECONNREFUSED
|
45
72
|
error("Connection refused - #{link} - Called from: #{from_url}")
|
46
73
|
rescue Timeout::Error, Errno::EINVAL, Errno::ECONNRESET, Errno::ETIMEDOUT,
|
47
74
|
EOFError, Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError, Net::ProtocolError, SocketError
|
48
75
|
error("Connection problems - #{link} - Called from: #{from_url}")
|
76
|
+
rescue Exception
|
77
|
+
error("Unknown error - #{link} - Called from: #{from_url}")
|
49
78
|
end
|
50
79
|
|
80
|
+
# Some helper methods
|
81
|
+
|
51
82
|
def same_domain?(link)
|
52
83
|
URI.parse(Rawler.url).host == URI.parse(link).host
|
53
84
|
end
|
@@ -59,21 +90,30 @@ module Rawler
|
|
59
90
|
def error(message)
|
60
91
|
Rawler.output.error(message)
|
61
92
|
end
|
93
|
+
|
94
|
+
# We use this method to inform the user of a page status
|
62
95
|
|
63
96
|
def record_response(code, link, from_url, redirection=nil)
|
97
|
+
|
98
|
+
# By default, we just give the status code and the page url
|
99
|
+
|
64
100
|
message = "#{code} - #{link}"
|
65
101
|
|
102
|
+
# If the status code is greater than or equal to 300, we also add which url linked to the current page
|
103
|
+
|
66
104
|
if code.to_i >= 300
|
67
105
|
message += " - Called from: #{from_url}"
|
68
106
|
end
|
107
|
+
|
108
|
+
# We add information about redirects, if a redirect was set
|
69
109
|
|
70
110
|
message += " - Following redirection to: #{redirection}" if redirection
|
71
111
|
|
112
|
+
# Depending on the status code, we use a different method of logger.
|
113
|
+
|
72
114
|
code = code.to_i
|
73
115
|
case code / 100
|
74
116
|
when 1
|
75
|
-
# TODO: check that if a 100 is received
|
76
|
-
# then there is another status code as well
|
77
117
|
Rawler.output.info(message)
|
78
118
|
when 2 then
|
79
119
|
Rawler.output.info(message)
|
data/lib/rawler/crawler.rb
CHANGED
@@ -1,39 +1,62 @@
|
|
1
|
+
# `Rawler::Crawler` is responsible for parsing links inside a page
|
2
|
+
|
1
3
|
module Rawler
|
2
4
|
|
3
5
|
class Crawler
|
4
6
|
|
5
|
-
|
7
|
+
# An instance of Rawler::Crawler has a url which represents the url for which we want to parse links.
|
8
|
+
|
9
|
+
attr_accessor :url
|
10
|
+
|
11
|
+
# We want to skip some kinds of links (those starting with javascript or mailto)
|
6
12
|
|
7
13
|
SKIP_FORMATS = /^(javascript|mailto)/
|
14
|
+
|
15
|
+
# To use this class, just pass it a url
|
8
16
|
|
9
17
|
def initialize(url)
|
10
18
|
@url = url.strip
|
11
19
|
end
|
20
|
+
|
21
|
+
# And then call `links` to get its links.
|
12
22
|
|
13
23
|
def links
|
24
|
+
# If the url is on a different domain than the main Rawler.url, or if the page is not html, we return an empty array
|
14
25
|
if different_domain?(url, Rawler.url) || not_html?(url)
|
15
26
|
return []
|
16
27
|
end
|
17
28
|
|
29
|
+
# Otherwise we fetch the page
|
30
|
+
|
18
31
|
response = Rawler::Request.get(url)
|
32
|
+
|
33
|
+
# And kindly ask nokogiri to convert it for us
|
19
34
|
|
20
35
|
doc = Nokogiri::HTML(response.body)
|
36
|
+
|
37
|
+
# We then do some magic, search all the links in the document that contain a valid link, and return them.
|
21
38
|
doc.css('a').map { |a| a['href'] }.select { |url| !url.nil? }.map { |url| absolute_url(url) }.select { |url| valid_url?(url) }
|
22
|
-
rescue Errno::ECONNREFUSED
|
39
|
+
rescue Errno::ECONNREFUSED
|
23
40
|
write("Couldn't connect to #{url}")
|
24
41
|
[]
|
25
|
-
rescue Errno::ETIMEDOUT
|
42
|
+
rescue Errno::ETIMEDOUT
|
26
43
|
write("Connection to #{url} timed out")
|
27
44
|
[]
|
28
45
|
end
|
29
46
|
|
30
47
|
private
|
31
48
|
|
49
|
+
# Here's how we transform a relative url to an absolute url
|
50
|
+
|
32
51
|
def absolute_url(path)
|
52
|
+
# First, encode the url
|
33
53
|
path = URI.encode(path.strip, Regexp.new("[^#{URI::PATTERN::UNRESERVED}#{URI::PATTERN::RESERVED}#]"))
|
54
|
+
|
55
|
+
# if the url contains a scheme that means it's already absolute
|
34
56
|
if URI.parse(path).scheme
|
35
57
|
path
|
36
58
|
else
|
59
|
+
# Otherwise we merge `url` to get the absolute url
|
37
60
|
URI.parse(url).merge(path).to_s
|
38
61
|
end
|
39
62
|
rescue URI::InvalidURIError
|
@@ -41,7 +64,8 @@ module Rawler
|
|
41
64
|
nil
|
42
65
|
end
|
43
66
|
|
44
|
-
#
|
67
|
+
# Some helper methods
|
68
|
+
|
45
69
|
def write(message)
|
46
70
|
Rawler.output.error(message)
|
47
71
|
end
|
data/lib/rawler/request.rb
CHANGED
data/spec/lib/rawler_spec.rb
CHANGED
@@ -135,6 +135,18 @@ describe Rawler::Base do
|
|
135
135
|
rawler.send(:add_status_code, url, from)
|
136
136
|
end
|
137
137
|
end
|
138
|
+
|
139
|
+
it "should rescue from general errors" do
|
140
|
+
url = 'http://example.com'
|
141
|
+
from = 'http://other.com'
|
142
|
+
|
143
|
+
Rawler::Request.should_receive(:get).and_raise
|
144
|
+
|
145
|
+
output.should_receive(:error).with("Unknown error - #{url} - Called from: #{from}")
|
146
|
+
|
147
|
+
rawler.send(:add_status_code, url, from)
|
148
|
+
end
|
149
|
+
|
138
150
|
|
139
151
|
end
|
140
152
|
|
data/test/helper.rb
ADDED
@@ -0,0 +1,18 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'bundler'
|
3
|
+
begin
|
4
|
+
Bundler.setup(:default, :development)
|
5
|
+
rescue Bundler::BundlerError => e
|
6
|
+
$stderr.puts e.message
|
7
|
+
$stderr.puts "Run `bundle install` to install missing gems"
|
8
|
+
exit e.status_code
|
9
|
+
end
|
10
|
+
require 'test/unit'
|
11
|
+
require 'shoulda'
|
12
|
+
|
13
|
+
$LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
|
14
|
+
$LOAD_PATH.unshift(File.dirname(__FILE__))
|
15
|
+
require 'rawler'
|
16
|
+
|
17
|
+
class Test::Unit::TestCase
|
18
|
+
end
|
data/test/test_rawler.rb
ADDED
metadata
CHANGED
@@ -1,12 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rawler
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
|
4
|
+
hash: 25
|
5
|
+
prerelease:
|
5
6
|
segments:
|
6
7
|
- 0
|
7
8
|
- 1
|
8
|
-
-
|
9
|
-
version: 0.1.
|
9
|
+
- 1
|
10
|
+
version: 0.1.1
|
10
11
|
platform: ruby
|
11
12
|
authors:
|
12
13
|
- Oscar Del Ben
|
@@ -14,71 +15,99 @@ autorequire:
|
|
14
15
|
bindir: bin
|
15
16
|
cert_chain: []
|
16
17
|
|
17
|
-
date: 2011-
|
18
|
-
default_executable:
|
18
|
+
date: 2011-09-16 00:00:00 +02:00
|
19
|
+
default_executable: rawler
|
19
20
|
dependencies:
|
20
21
|
- !ruby/object:Gem::Dependency
|
21
|
-
name: nokogiri
|
22
22
|
prerelease: false
|
23
|
+
type: :runtime
|
23
24
|
requirement: &id001 !ruby/object:Gem::Requirement
|
24
25
|
none: false
|
25
26
|
requirements:
|
26
27
|
- - ">="
|
27
28
|
- !ruby/object:Gem::Version
|
29
|
+
hash: 3
|
28
30
|
segments:
|
29
31
|
- 0
|
30
32
|
version: "0"
|
31
|
-
|
33
|
+
name: nokogiri
|
32
34
|
version_requirements: *id001
|
33
35
|
- !ruby/object:Gem::Dependency
|
34
|
-
name: rubyforge
|
35
36
|
prerelease: false
|
37
|
+
type: :development
|
36
38
|
requirement: &id002 !ruby/object:Gem::Requirement
|
37
39
|
none: false
|
38
40
|
requirements:
|
39
41
|
- - ">="
|
40
42
|
- !ruby/object:Gem::Version
|
43
|
+
hash: 3
|
41
44
|
segments:
|
42
|
-
- 2
|
43
45
|
- 0
|
44
|
-
|
45
|
-
|
46
|
-
type: :development
|
46
|
+
version: "0"
|
47
|
+
name: shoulda
|
47
48
|
version_requirements: *id002
|
48
49
|
- !ruby/object:Gem::Dependency
|
49
|
-
name: hoe
|
50
50
|
prerelease: false
|
51
|
+
type: :development
|
51
52
|
requirement: &id003 !ruby/object:Gem::Requirement
|
52
53
|
none: false
|
53
54
|
requirements:
|
54
|
-
- -
|
55
|
+
- - ~>
|
56
|
+
- !ruby/object:Gem::Version
|
57
|
+
hash: 23
|
58
|
+
segments:
|
59
|
+
- 1
|
60
|
+
- 0
|
61
|
+
- 0
|
62
|
+
version: 1.0.0
|
63
|
+
name: bundler
|
64
|
+
version_requirements: *id003
|
65
|
+
- !ruby/object:Gem::Dependency
|
66
|
+
prerelease: false
|
67
|
+
type: :development
|
68
|
+
requirement: &id004 !ruby/object:Gem::Requirement
|
69
|
+
none: false
|
70
|
+
requirements:
|
71
|
+
- - ~>
|
55
72
|
- !ruby/object:Gem::Version
|
73
|
+
hash: 7
|
56
74
|
segments:
|
57
|
-
-
|
75
|
+
- 1
|
58
76
|
- 6
|
59
|
-
-
|
60
|
-
version:
|
77
|
+
- 4
|
78
|
+
version: 1.6.4
|
79
|
+
name: jeweler
|
80
|
+
version_requirements: *id004
|
81
|
+
- !ruby/object:Gem::Dependency
|
82
|
+
prerelease: false
|
61
83
|
type: :development
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
84
|
+
requirement: &id005 !ruby/object:Gem::Requirement
|
85
|
+
none: false
|
86
|
+
requirements:
|
87
|
+
- - ">="
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
hash: 3
|
90
|
+
segments:
|
91
|
+
- 0
|
92
|
+
version: "0"
|
93
|
+
name: rcov
|
94
|
+
version_requirements: *id005
|
95
|
+
description: Rawler is a tool that crawls the links of your website
|
96
|
+
email: info@oscardelben.com
|
69
97
|
executables:
|
70
98
|
- rawler
|
71
99
|
extensions: []
|
72
100
|
|
73
101
|
extra_rdoc_files:
|
74
|
-
-
|
75
|
-
- README.
|
102
|
+
- LICENSE.txt
|
103
|
+
- README.rdoc
|
76
104
|
files:
|
77
105
|
- Gemfile
|
78
106
|
- Gemfile.lock
|
79
|
-
-
|
80
|
-
- README.
|
107
|
+
- LICENSE.txt
|
108
|
+
- README.rdoc
|
81
109
|
- Rakefile
|
110
|
+
- VERSION
|
82
111
|
- bin/rawler
|
83
112
|
- lib/rawler.rb
|
84
113
|
- lib/rawler/base.rb
|
@@ -92,17 +121,16 @@ files:
|
|
92
121
|
- spec/lib/rawler_spec.rb
|
93
122
|
- spec/spec.opts
|
94
123
|
- spec/spec_helper.rb
|
95
|
-
-
|
96
|
-
-
|
124
|
+
- test/helper.rb
|
125
|
+
- test/test_rawler.rb
|
97
126
|
- vendor/lib-trollop.rb
|
98
127
|
has_rdoc: true
|
99
128
|
homepage: http://github.com/oscardelben/rawler
|
100
|
-
licenses:
|
101
|
-
|
129
|
+
licenses:
|
130
|
+
- MIT
|
102
131
|
post_install_message:
|
103
|
-
rdoc_options:
|
104
|
-
|
105
|
-
- README.txt
|
132
|
+
rdoc_options: []
|
133
|
+
|
106
134
|
require_paths:
|
107
135
|
- lib
|
108
136
|
required_ruby_version: !ruby/object:Gem::Requirement
|
@@ -110,6 +138,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
110
138
|
requirements:
|
111
139
|
- - ">="
|
112
140
|
- !ruby/object:Gem::Version
|
141
|
+
hash: 3
|
113
142
|
segments:
|
114
143
|
- 0
|
115
144
|
version: "0"
|
@@ -118,15 +147,16 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
118
147
|
requirements:
|
119
148
|
- - ">="
|
120
149
|
- !ruby/object:Gem::Version
|
150
|
+
hash: 3
|
121
151
|
segments:
|
122
152
|
- 0
|
123
153
|
version: "0"
|
124
154
|
requirements: []
|
125
155
|
|
126
|
-
rubyforge_project:
|
127
|
-
rubygems_version: 1.
|
156
|
+
rubyforge_project:
|
157
|
+
rubygems_version: 1.6.2
|
128
158
|
signing_key:
|
129
159
|
specification_version: 3
|
130
|
-
summary: Rawler is a
|
160
|
+
summary: Rawler is a tool that crawls the links of your website
|
131
161
|
test_files: []
|
132
162
|
|
data/Manifest.txt
DELETED
@@ -1,21 +0,0 @@
|
|
1
|
-
Gemfile
|
2
|
-
Gemfile.lock
|
3
|
-
Manifest.txt
|
4
|
-
README.txt
|
5
|
-
Rakefile
|
6
|
-
bin/rawler
|
7
|
-
lib/rawler.rb
|
8
|
-
lib/rawler/base.rb
|
9
|
-
lib/rawler/core_extensions.rb
|
10
|
-
lib/rawler/core_extensions/module.rb
|
11
|
-
lib/rawler/crawler.rb
|
12
|
-
lib/rawler/request.rb
|
13
|
-
spec/lib/base_spec.rb
|
14
|
-
spec/lib/rawler/base_spec.rb
|
15
|
-
spec/lib/rawler/crawler_spec.rb
|
16
|
-
spec/lib/rawler_spec.rb
|
17
|
-
spec/spec.opts
|
18
|
-
spec/spec_helper.rb
|
19
|
-
specs.watchr
|
20
|
-
tasks/rspec.rake
|
21
|
-
vendor/lib-trollop.rb
|
data/specs.watchr
DELETED
@@ -1,58 +0,0 @@
|
|
1
|
-
# Run me with:
|
2
|
-
#
|
3
|
-
# $ watchr specs.watchr
|
4
|
-
|
5
|
-
# --------------------------------------------------
|
6
|
-
# Convenience Methods
|
7
|
-
# --------------------------------------------------
|
8
|
-
def all_test_files
|
9
|
-
Dir['spec/**/*_spec.rb']
|
10
|
-
end
|
11
|
-
|
12
|
-
def run_test_matching(thing_to_match)
|
13
|
-
matches = all_test_files.grep(/#{thing_to_match}/i)
|
14
|
-
if matches.empty?
|
15
|
-
puts "Sorry, thanks for playing, but there were no matches for #{thing_to_match}"
|
16
|
-
else
|
17
|
-
run matches.join(' ')
|
18
|
-
end
|
19
|
-
end
|
20
|
-
|
21
|
-
def run(files_to_run)
|
22
|
-
puts("Running: #{files_to_run}")
|
23
|
-
system("clear;rspec -cfs #{files_to_run}")
|
24
|
-
no_int_for_you
|
25
|
-
end
|
26
|
-
|
27
|
-
def run_all_tests
|
28
|
-
run(all_test_files.join(' '))
|
29
|
-
end
|
30
|
-
|
31
|
-
# --------------------------------------------------
|
32
|
-
# Watchr Rules
|
33
|
-
# --------------------------------------------------
|
34
|
-
watch('^spec/(.*)_spec\.rb' ) { |m| run_test_matching(m[1]) }
|
35
|
-
watch('^lib/(.*)\.rb' ) { |m| run_test_matching(m[1]) }
|
36
|
-
watch('^spec/spec_helper\.rb') { run_all_tests }
|
37
|
-
# --------------------------------------------------
|
38
|
-
# Signal Handling
|
39
|
-
# --------------------------------------------------
|
40
|
-
|
41
|
-
def no_int_for_you
|
42
|
-
@sent_an_int = nil
|
43
|
-
end
|
44
|
-
|
45
|
-
Signal.trap 'INT' do
|
46
|
-
if @sent_an_int then
|
47
|
-
puts " A second INT? Ok, I get the message. Shutting down now."
|
48
|
-
exit
|
49
|
-
else
|
50
|
-
puts " Did you just send me an INT? Ugh. I'll quit for real if you do it again."
|
51
|
-
@sent_an_int = true
|
52
|
-
Kernel.sleep 1.5
|
53
|
-
run_all_tests
|
54
|
-
end
|
55
|
-
end
|
56
|
-
|
57
|
-
# vim:ft=ruby
|
58
|
-
|
data/tasks/rspec.rake
DELETED
@@ -1,21 +0,0 @@
|
|
1
|
-
begin
|
2
|
-
require 'spec'
|
3
|
-
rescue LoadError
|
4
|
-
require 'rubygems' unless ENV['NO_RUBYGEMS']
|
5
|
-
require 'spec'
|
6
|
-
end
|
7
|
-
begin
|
8
|
-
require 'spec/rake/spectask'
|
9
|
-
rescue LoadError
|
10
|
-
puts <<-EOS
|
11
|
-
To use rspec for testing you must install rspec gem:
|
12
|
-
gem install rspec
|
13
|
-
EOS
|
14
|
-
exit(0)
|
15
|
-
end
|
16
|
-
|
17
|
-
desc "Run the specs under spec/models"
|
18
|
-
Spec::Rake::SpecTask.new do |t|
|
19
|
-
t.spec_opts = ['--options', "spec/spec.opts"]
|
20
|
-
t.spec_files = FileList['spec/**/*_spec.rb']
|
21
|
-
end
|