rawler 0.1.3 → 0.1.4
Sign up to get free protection for your applications and to get access to all the features.
- data/Gemfile +3 -2
- data/Gemfile.lock +14 -2
- data/VERSION +1 -1
- data/lib/rawler.rb +1 -58
- data/lib/rawler/base.rb +3 -46
- data/lib/rawler/core_extensions.rb +0 -2
- data/lib/rawler/core_extensions/module.rb +5 -7
- data/lib/rawler/crawler.rb +6 -33
- data/lib/rawler/request.rb +6 -14
- data/rawler.gemspec +9 -3
- data/spec/lib/rawler/crawler_spec.rb +19 -0
- data/spec/lib/rawler_spec.rb +21 -0
- metadata +51 -23
data/Gemfile
CHANGED
@@ -7,8 +7,9 @@ source "http://rubygems.org"
|
|
7
7
|
# Include everything needed to run rake, tests, features, etc.
|
8
8
|
gem 'nokogiri'
|
9
9
|
|
10
|
-
|
11
|
-
|
10
|
+
group :development, :test do
|
11
|
+
gem 'fakeweb'
|
12
|
+
gem "rspec"
|
12
13
|
gem "shoulda", ">= 0"
|
13
14
|
gem "bundler", "~> 1.0.0"
|
14
15
|
gem "jeweler", "~> 1.6.4"
|
data/Gemfile.lock
CHANGED
@@ -1,14 +1,24 @@
|
|
1
1
|
GEM
|
2
2
|
remote: http://rubygems.org/
|
3
3
|
specs:
|
4
|
+
diff-lcs (1.1.3)
|
5
|
+
fakeweb (1.3.0)
|
4
6
|
git (1.2.5)
|
5
7
|
jeweler (1.6.4)
|
6
8
|
bundler (~> 1.0)
|
7
9
|
git (>= 1.2.5)
|
8
10
|
rake
|
9
11
|
nokogiri (1.5.0)
|
10
|
-
rake (0.9.2)
|
11
|
-
rcov (0.9.
|
12
|
+
rake (0.9.2.2)
|
13
|
+
rcov (0.9.11)
|
14
|
+
rspec (2.7.0)
|
15
|
+
rspec-core (~> 2.7.0)
|
16
|
+
rspec-expectations (~> 2.7.0)
|
17
|
+
rspec-mocks (~> 2.7.0)
|
18
|
+
rspec-core (2.7.1)
|
19
|
+
rspec-expectations (2.7.0)
|
20
|
+
diff-lcs (~> 1.1.2)
|
21
|
+
rspec-mocks (2.7.0)
|
12
22
|
shoulda (2.11.3)
|
13
23
|
|
14
24
|
PLATFORMS
|
@@ -16,7 +26,9 @@ PLATFORMS
|
|
16
26
|
|
17
27
|
DEPENDENCIES
|
18
28
|
bundler (~> 1.0.0)
|
29
|
+
fakeweb
|
19
30
|
jeweler (~> 1.6.4)
|
20
31
|
nokogiri
|
21
32
|
rcov
|
33
|
+
rspec
|
22
34
|
shoulda
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.1.3
|
1
|
+
0.1.4
|
data/lib/rawler.rb
CHANGED
@@ -1,82 +1,25 @@
|
|
1
|
-
# **Rawler** is a command line tool for finding broken links on your website.
|
2
|
-
# You can install Rawler by running:
|
3
|
-
#
|
4
|
-
# gem install rawler
|
5
|
-
#
|
6
|
-
# To use Rawler type:
|
7
|
-
#
|
8
|
-
# rawler example.com
|
9
|
-
#
|
10
|
-
# Type `rawler -h` to see all the available options (including basic auth support).
|
11
|
-
|
12
|
-
#### Prerequisites
|
13
|
-
|
14
|
-
|
15
1
|
require 'rubygems'
|
16
|
-
|
17
|
-
# We use [net/https](http://www.ruby-doc.org/stdlib/libdoc/net/http/rdoc/index.html) for making requests.
|
18
|
-
|
19
2
|
require 'net/https'
|
20
|
-
|
21
|
-
# We use [nokogiri](http://nokogiri.org/) for parsing web pages.
|
22
|
-
|
23
3
|
require 'nokogiri'
|
24
|
-
|
25
|
-
# We use the [logger](http://www.ruby-doc.org/stdlib/libdoc/logger/rdoc/) utility for handling the output.
|
26
|
-
|
27
4
|
require 'logger'
|
28
|
-
|
29
|
-
# We require [rawler/core_extensions](rawler/core_extensions.html) which includes some core extensions we need.
|
30
|
-
|
31
5
|
require 'rawler/core_extensions'
|
32
|
-
|
33
|
-
#### The Rawler module
|
34
|
-
|
35
|
-
# The Rawler module itself is very simple, and it's only used for storing configuration data like the url that we want to fetch, basic username and password.
|
36
|
-
|
37
6
|
module Rawler
|
38
7
|
VERSION = "#{File.read(File.expand_path(File.dirname(__FILE__)) + '/../VERSION')}"
|
39
|
-
|
40
|
-
# `output` is where we want to direct output. It's set to `$stdout` by default.
|
41
8
|
|
42
9
|
mattr_accessor :output
|
43
|
-
|
44
|
-
# `url` is the url that we want to fetch. We need to keep track of it when parsing other pages to see if they are of the same domain.
|
45
|
-
|
46
10
|
mattr_accessor :url
|
47
|
-
|
48
|
-
# The time we wait between requests, default 3. We don't want to send too many requests to your website!
|
49
|
-
|
50
11
|
mattr_accessor :wait
|
51
|
-
|
52
|
-
# Username and Password for basic auth, if needed.
|
53
|
-
|
54
12
|
mattr_accessor :username, :password
|
55
|
-
|
56
|
-
# Log switch
|
57
|
-
|
58
13
|
mattr_accessor :log
|
59
14
|
|
60
|
-
# Here we autoload when needed the specific namespaces.
|
61
|
-
|
62
|
-
# [Rawler::Base](rawler/base.html) is responsible for validating all the pages in a domain. It's where all the magic happens.
|
63
|
-
|
64
15
|
autoload :Base, "rawler/base"
|
65
|
-
|
66
|
-
# [Rawler::Crawler](rawler/crawler.html) is responsible for parsing links inside a page.
|
67
|
-
|
68
16
|
autoload :Crawler, "rawler/crawler"
|
69
|
-
|
70
|
-
# [Rawler::Request](rawler/reqeust.html) contains some helper methods for performing requests.
|
71
|
-
|
72
17
|
autoload :Request, "rawler/request"
|
73
18
|
|
74
|
-
# We overwrite url= to automatically add `http://` if needed so that you can simply type `rawler example.com` in the command line.
|
75
|
-
|
76
19
|
def self.url=(url)
|
77
20
|
url.strip!
|
78
21
|
|
79
|
-
if (url =~ /http
|
22
|
+
if (url =~ /http[s]?:\/\//) != 0
|
80
23
|
url = 'http://' + url
|
81
24
|
end
|
82
25
|
|
data/lib/rawler/base.rb
CHANGED
@@ -1,18 +1,7 @@
|
|
1
|
-
#### Rawler workflow
|
2
|
-
|
3
|
-
# `Rawler::Base` is where all the heavy work is being made.
|
4
|
-
# When you call `rawler somesite.com`, we create an instance of Rawler::Base and then call `validate`, which recursively validates all the links relative to the domain that we specified.
|
5
|
-
|
6
1
|
module Rawler
|
7
|
-
|
8
2
|
class Base
|
9
|
-
|
10
|
-
# `responses` is used to keep track of which links we have already parsed, so that we wont parse them again and again.
|
11
|
-
# TODO: rename `responses` to something more meaningful.
|
12
3
|
|
13
4
|
attr_accessor :responses
|
14
|
-
|
15
|
-
# When we instantiate `Rawler::Base` we set some options according to what you specified on the command line.
|
16
5
|
|
17
6
|
def initialize(url, output, options={})
|
18
7
|
@responses = {}
|
@@ -26,18 +15,13 @@ module Rawler
|
|
26
15
|
Rawler.log = options[:log]
|
27
16
|
@logfile = File.new("rawler_log.txt", "w") if Rawler.log
|
28
17
|
end
|
29
|
-
|
30
|
-
# The method used to start the real validation process
|
31
18
|
|
32
19
|
def validate
|
33
20
|
validate_links_in_page(Rawler.url)
|
34
21
|
@logfile.close if Rawler.log
|
35
22
|
end
|
36
|
-
|
23
|
+
|
37
24
|
private
|
38
|
-
|
39
|
-
# We ask [Rawler::Crawler](crawler.html) for all the links in page and then validate each of them individually.
|
40
|
-
# We then sleep for the value of `Rawler.wait` (default 3) between each request to avoid dossing your server.
|
41
25
|
|
42
26
|
def validate_links_in_page(page)
|
43
27
|
Rawler::Crawler.new(page).links.each do |page_url|
|
@@ -45,8 +29,6 @@ module Rawler
|
|
45
29
|
sleep(Rawler.wait)
|
46
30
|
end
|
47
31
|
end
|
48
|
-
|
49
|
-
# If we haven't validated the page yet, we check its status code and then validate all the links in the page if it's in the same domain
|
50
32
|
|
51
33
|
def validate_page(page_url, from_url)
|
52
34
|
if not_yet_parsed?(page_url)
|
@@ -54,22 +36,12 @@ module Rawler
|
|
54
36
|
validate_links_in_page(page_url) if same_domain?(page_url)
|
55
37
|
end
|
56
38
|
end
|
57
|
-
|
58
|
-
# This is where we check the specific page status.
|
59
39
|
|
60
40
|
def add_status_code(link, from_url)
|
61
41
|
response = Rawler::Request.get(link)
|
62
42
|
|
63
|
-
# We follow a redirect if necessary.
|
64
|
-
|
65
43
|
validate_page(response['Location'], from_url) if response['Location']
|
66
|
-
|
67
|
-
# We inform the user about what we got.
|
68
|
-
|
69
44
|
record_response(response.code, link, from_url, response['Location'])
|
70
|
-
|
71
|
-
# We add the current page to `responses` to avoid parsing it again/
|
72
|
-
|
73
45
|
responses[link] = { :status => response.code.to_i }
|
74
46
|
rescue Errno::ECONNREFUSED
|
75
47
|
error("Connection refused - #{link} - Called from: #{from_url}")
|
@@ -79,41 +51,28 @@ module Rawler
|
|
79
51
|
rescue Exception
|
80
52
|
error("Unknown error - #{link} - Called from: #{from_url}")
|
81
53
|
end
|
82
|
-
|
83
|
-
# Some helper methods
|
84
54
|
|
85
55
|
def same_domain?(link)
|
86
56
|
URI.parse(Rawler.url).host == URI.parse(link).host
|
87
57
|
end
|
88
|
-
|
58
|
+
|
89
59
|
def not_yet_parsed?(link)
|
90
60
|
responses[link].nil?
|
91
61
|
end
|
92
|
-
|
62
|
+
|
93
63
|
def error(message)
|
94
64
|
Rawler.output.error(message)
|
95
65
|
end
|
96
66
|
|
97
|
-
# We use this method to inform the user of a page status
|
98
|
-
|
99
67
|
def record_response(code, link, from_url, redirection=nil)
|
100
|
-
|
101
|
-
# By default, we just give the status code and the page url
|
102
|
-
|
103
68
|
message = "#{code} - #{link}"
|
104
69
|
|
105
|
-
# If the status code is more or equal than 300, we also add which url linked the current page
|
106
|
-
|
107
70
|
if code.to_i >= 300
|
108
71
|
message += " - Called from: #{from_url}"
|
109
72
|
end
|
110
73
|
|
111
|
-
# We add information about redirects, if a redirect was set
|
112
|
-
|
113
74
|
message += " - Following redirection to: #{redirection}" if redirection
|
114
75
|
|
115
|
-
# Depending on the status code, we use a different method of logger.
|
116
|
-
|
117
76
|
code = code.to_i
|
118
77
|
case code / 100
|
119
78
|
when 1,2
|
@@ -127,7 +86,5 @@ module Rawler
|
|
127
86
|
end
|
128
87
|
@logfile.puts(message) if Rawler.log
|
129
88
|
end
|
130
|
-
|
131
89
|
end
|
132
|
-
|
133
90
|
end
|
@@ -1,5 +1,3 @@
|
|
1
|
-
# Add `attr_accessor` like methods to modules
|
2
|
-
|
3
1
|
class Module
|
4
2
|
def mattr_reader(*syms)
|
5
3
|
syms.each do |sym|
|
@@ -8,7 +6,7 @@ class Module
|
|
8
6
|
unless defined? @@#{sym}
|
9
7
|
@@#{sym} = nil
|
10
8
|
end
|
11
|
-
|
9
|
+
|
12
10
|
def self.#{sym}
|
13
11
|
@@#{sym}
|
14
12
|
end
|
@@ -19,18 +17,18 @@ class Module
|
|
19
17
|
EOS
|
20
18
|
end
|
21
19
|
end
|
22
|
-
|
20
|
+
|
23
21
|
def mattr_writer(*syms)
|
24
22
|
syms.each do |sym|
|
25
23
|
class_eval(<<-EOS, __FILE__, __LINE__)
|
26
24
|
unless defined? @@#{sym}
|
27
25
|
@@#{sym} = nil
|
28
26
|
end
|
29
|
-
|
27
|
+
|
30
28
|
def self.#{sym}=(obj)
|
31
29
|
@@#{sym} = obj
|
32
30
|
end
|
33
|
-
|
31
|
+
|
34
32
|
#{"
|
35
33
|
def #{sym}=(obj)
|
36
34
|
@@#{sym} = obj
|
@@ -39,7 +37,7 @@ class Module
|
|
39
37
|
EOS
|
40
38
|
end
|
41
39
|
end
|
42
|
-
|
40
|
+
|
43
41
|
def mattr_accessor(*syms)
|
44
42
|
mattr_reader(*syms)
|
45
43
|
mattr_writer(*syms)
|
data/lib/rawler/crawler.rb
CHANGED
@@ -1,40 +1,23 @@
|
|
1
|
-
# `Rawler::Crawler` is responsible for parsing links inside a page
|
2
|
-
|
3
1
|
module Rawler
|
4
|
-
|
5
2
|
class Crawler
|
6
|
-
|
7
|
-
# An instance of Rawler::Crawler has a url which represents the url for which we want to parse links.
|
8
3
|
|
9
4
|
attr_accessor :url
|
10
5
|
|
11
|
-
|
12
|
-
|
13
|
-
SKIP_FORMATS = /^(javascript|mailto)/
|
14
|
-
|
15
|
-
# To use this class, just pass it a url
|
6
|
+
SKIP_FORMATS = /^(javascript|mailto|callto)/
|
16
7
|
|
17
8
|
def initialize(url)
|
18
9
|
@url = url.strip
|
19
10
|
end
|
20
11
|
|
21
|
-
# And then call `links` to get its links.
|
22
|
-
|
23
12
|
def links
|
24
|
-
# If the url is different than the main Rawler.url, or if the page is not html, we return an empty array
|
25
13
|
if different_domain?(url, Rawler.url) || not_html?(url)
|
26
14
|
return []
|
27
15
|
end
|
28
|
-
|
29
|
-
# Otherwise we fetch the page
|
30
16
|
|
31
17
|
response = Rawler::Request.get(url)
|
32
18
|
|
33
|
-
# And kindly ask nokogiri to convert it for us
|
34
|
-
|
35
19
|
doc = Nokogiri::HTML(response.body)
|
36
20
|
|
37
|
-
# We then do some magic, search all the links in the document that contain a valid link, and return them.
|
38
21
|
doc.css('a').map { |a| a['href'] }.select { |url| !url.nil? }.map { |url| absolute_url(url) }.select { |url| valid_url?(url) }
|
39
22
|
rescue Errno::ECONNREFUSED
|
40
23
|
write("Couldn't connect to #{url}")
|
@@ -43,47 +26,39 @@ module Rawler
|
|
43
26
|
write("Connection to #{url} timed out")
|
44
27
|
[]
|
45
28
|
end
|
46
|
-
|
29
|
+
|
47
30
|
private
|
48
|
-
|
49
|
-
# Here's how we transform a relative url to an absolute url
|
50
31
|
|
51
32
|
def absolute_url(path)
|
52
|
-
# First, encode the url
|
53
33
|
path = URI.encode(path.strip, Regexp.new("[^#{URI::PATTERN::UNRESERVED}#{URI::PATTERN::RESERVED}#]"))
|
54
34
|
|
55
|
-
# if the url contains a scheme that means it's already absolute
|
56
35
|
if URI.parse(path).scheme
|
57
36
|
path
|
58
37
|
else
|
59
|
-
# Otherwise we merge `url` to get the absolute url
|
60
38
|
URI.parse(url).merge(path).to_s
|
61
39
|
end
|
62
40
|
rescue URI::InvalidURIError
|
63
41
|
write("Invalid url: #{path} - Called from: #{url}")
|
64
42
|
nil
|
65
43
|
end
|
66
|
-
|
67
|
-
# Some helper methods
|
68
44
|
|
69
45
|
def write(message)
|
70
46
|
Rawler.output.error(message)
|
71
47
|
end
|
72
|
-
|
48
|
+
|
73
49
|
def different_domain?(url_1, url_2)
|
74
50
|
URI.parse(url_1).host != URI.parse(url_2).host
|
75
51
|
end
|
76
|
-
|
52
|
+
|
77
53
|
def not_html?(url)
|
78
54
|
Rawler::Request.head(url).content_type != 'text/html'
|
79
55
|
end
|
80
|
-
|
56
|
+
|
81
57
|
def valid_url?(url)
|
82
58
|
return false unless url
|
83
|
-
|
84
59
|
url.strip!
|
60
|
+
|
85
61
|
scheme = URI.parse(url).scheme
|
86
|
-
|
87
62
|
if ['http', 'https'].include?(scheme)
|
88
63
|
true
|
89
64
|
else
|
@@ -95,7 +70,5 @@ module Rawler
|
|
95
70
|
false
|
96
71
|
write("Invalid url - #{url}")
|
97
72
|
end
|
98
|
-
|
99
73
|
end
|
100
|
-
|
101
74
|
end
|
data/lib/rawler/request.rb
CHANGED
@@ -1,25 +1,20 @@
|
|
1
|
-
# `Rawler::Request` contains some abstraction for making web requests, like automatically adding ssl and basic auth.
|
2
|
-
|
3
1
|
module Rawler
|
4
|
-
|
5
2
|
class Request
|
6
|
-
|
7
3
|
class << self
|
8
|
-
|
4
|
+
|
9
5
|
def get(url)
|
10
6
|
perform_request(:get, url)
|
11
7
|
end
|
12
|
-
|
8
|
+
|
13
9
|
def head(url)
|
14
10
|
perform_request(:head, url)
|
15
11
|
end
|
16
|
-
|
12
|
+
|
17
13
|
private
|
18
|
-
|
14
|
+
|
19
15
|
def perform_request(method, url)
|
20
16
|
uri = URI.parse(url)
|
21
17
|
|
22
|
-
# Use http_proxy if set
|
23
18
|
proxy = URI.parse(ENV['http_proxy']) if ENV['http_proxy'] rescue nil
|
24
19
|
if proxy
|
25
20
|
http = Net::HTTP::Proxy(proxy.host, proxy.port).new(uri.host, uri.port)
|
@@ -28,16 +23,13 @@ module Rawler
|
|
28
23
|
end
|
29
24
|
http.use_ssl = (uri.scheme == 'https')
|
30
25
|
http.verify_mode = OpenSSL::SSL::VERIFY_NONE
|
31
|
-
|
26
|
+
|
32
27
|
path = (uri.path.size == 0) ? "/" : uri.path
|
33
|
-
|
28
|
+
|
34
29
|
request = Net::HTTP::Get.new(path)
|
35
30
|
request.basic_auth(Rawler.username, Rawler.password)
|
36
31
|
http.request(request)
|
37
32
|
end
|
38
|
-
|
39
33
|
end
|
40
|
-
|
41
34
|
end
|
42
|
-
|
43
35
|
end
|
data/rawler.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = "rawler"
|
8
|
-
s.version = "0.1.3"
|
8
|
+
s.version = "0.1.4"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Oscar Del Ben"]
|
12
|
-
s.date = "
|
12
|
+
s.date = "2012-02-27"
|
13
13
|
s.description = "Rawler is a tool that crawls the links of your website"
|
14
14
|
s.email = "info@oscardelben.com"
|
15
15
|
s.executables = ["rawler"]
|
@@ -45,7 +45,7 @@ Gem::Specification.new do |s|
|
|
45
45
|
s.homepage = "http://github.com/oscardelben/rawler"
|
46
46
|
s.licenses = ["MIT"]
|
47
47
|
s.require_paths = ["lib"]
|
48
|
-
s.rubygems_version = "1.8.
|
48
|
+
s.rubygems_version = "1.8.10"
|
49
49
|
s.summary = "Rawler is a tool that crawls the links of your website"
|
50
50
|
|
51
51
|
if s.respond_to? :specification_version then
|
@@ -53,12 +53,16 @@ Gem::Specification.new do |s|
|
|
53
53
|
|
54
54
|
if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
|
55
55
|
s.add_runtime_dependency(%q<nokogiri>, [">= 0"])
|
56
|
+
s.add_development_dependency(%q<fakeweb>, [">= 0"])
|
57
|
+
s.add_development_dependency(%q<rspec>, [">= 0"])
|
56
58
|
s.add_development_dependency(%q<shoulda>, [">= 0"])
|
57
59
|
s.add_development_dependency(%q<bundler>, ["~> 1.0.0"])
|
58
60
|
s.add_development_dependency(%q<jeweler>, ["~> 1.6.4"])
|
59
61
|
s.add_development_dependency(%q<rcov>, [">= 0"])
|
60
62
|
else
|
61
63
|
s.add_dependency(%q<nokogiri>, [">= 0"])
|
64
|
+
s.add_dependency(%q<fakeweb>, [">= 0"])
|
65
|
+
s.add_dependency(%q<rspec>, [">= 0"])
|
62
66
|
s.add_dependency(%q<shoulda>, [">= 0"])
|
63
67
|
s.add_dependency(%q<bundler>, ["~> 1.0.0"])
|
64
68
|
s.add_dependency(%q<jeweler>, ["~> 1.6.4"])
|
@@ -66,6 +70,8 @@ Gem::Specification.new do |s|
|
|
66
70
|
end
|
67
71
|
else
|
68
72
|
s.add_dependency(%q<nokogiri>, [">= 0"])
|
73
|
+
s.add_dependency(%q<fakeweb>, [">= 0"])
|
74
|
+
s.add_dependency(%q<rspec>, [">= 0"])
|
69
75
|
s.add_dependency(%q<shoulda>, [">= 0"])
|
70
76
|
s.add_dependency(%q<bundler>, ["~> 1.0.0"])
|
71
77
|
s.add_dependency(%q<jeweler>, ["~> 1.6.4"])
|
@@ -159,6 +159,25 @@ describe Rawler::Crawler do
|
|
159
159
|
crawler.links
|
160
160
|
end
|
161
161
|
end
|
162
|
+
|
163
|
+
context "callto" do
|
164
|
+
let(:url) { 'http://example.com/path' }
|
165
|
+
let(:crawler) { Rawler::Crawler.new(url) }
|
166
|
+
let(:content) { "<a href=\"callto:home22\">foo</a><a name=\"foo\">" }
|
167
|
+
|
168
|
+
before(:each) do
|
169
|
+
register(url, content)
|
170
|
+
end
|
171
|
+
|
172
|
+
it "should return empty links" do
|
173
|
+
crawler.links.should == []
|
174
|
+
end
|
175
|
+
|
176
|
+
it "should not report the error" do
|
177
|
+
crawler.should_not_receive(:write)
|
178
|
+
crawler.links
|
179
|
+
end
|
180
|
+
end
|
162
181
|
|
163
182
|
end
|
164
183
|
|
data/spec/lib/rawler_spec.rb
CHANGED
@@ -20,6 +20,27 @@ describe Rawler::Base do
|
|
20
20
|
Rawler::Base.new(original, output)
|
21
21
|
Rawler.url.should == expected
|
22
22
|
end
|
23
|
+
|
24
|
+
it "should auto prepend http" do
|
25
|
+
original = 'example.com'
|
26
|
+
expected = 'http://example.com'
|
27
|
+
Rawler::Base.new(original, output)
|
28
|
+
Rawler.url.should == expected
|
29
|
+
end
|
30
|
+
|
31
|
+
it "should not auto prepend http when already http" do
|
32
|
+
original = 'http://example.com'
|
33
|
+
expected = 'http://example.com'
|
34
|
+
Rawler::Base.new(original, output)
|
35
|
+
Rawler.url.should == expected
|
36
|
+
end
|
37
|
+
|
38
|
+
it "should not auto prepend http when https" do
|
39
|
+
original = 'https://example.com'
|
40
|
+
expected = 'https://example.com'
|
41
|
+
Rawler::Base.new(original, output)
|
42
|
+
Rawler.url.should == expected
|
43
|
+
end
|
23
44
|
end
|
24
45
|
|
25
46
|
describe "validate_links" do
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rawler
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 19
|
5
5
|
prerelease:
|
6
6
|
segments:
|
7
7
|
- 0
|
8
8
|
- 1
|
9
|
-
-
|
10
|
-
version: 0.1.3
|
9
|
+
- 4
|
10
|
+
version: 0.1.4
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Oscar Del Ben
|
@@ -15,10 +15,10 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date:
|
18
|
+
date: 2012-02-27 00:00:00 Z
|
19
19
|
dependencies:
|
20
20
|
- !ruby/object:Gem::Dependency
|
21
|
-
|
21
|
+
version_requirements: &id001 !ruby/object:Gem::Requirement
|
22
22
|
none: false
|
23
23
|
requirements:
|
24
24
|
- - ">="
|
@@ -27,12 +27,26 @@ dependencies:
|
|
27
27
|
segments:
|
28
28
|
- 0
|
29
29
|
version: "0"
|
30
|
-
|
30
|
+
requirement: *id001
|
31
|
+
type: :runtime
|
32
|
+
prerelease: false
|
31
33
|
name: nokogiri
|
34
|
+
- !ruby/object:Gem::Dependency
|
35
|
+
version_requirements: &id002 !ruby/object:Gem::Requirement
|
36
|
+
none: false
|
37
|
+
requirements:
|
38
|
+
- - ">="
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
hash: 3
|
41
|
+
segments:
|
42
|
+
- 0
|
43
|
+
version: "0"
|
44
|
+
requirement: *id002
|
45
|
+
type: :development
|
32
46
|
prerelease: false
|
33
|
-
|
47
|
+
name: fakeweb
|
34
48
|
- !ruby/object:Gem::Dependency
|
35
|
-
|
49
|
+
version_requirements: &id003 !ruby/object:Gem::Requirement
|
36
50
|
none: false
|
37
51
|
requirements:
|
38
52
|
- - ">="
|
@@ -41,12 +55,26 @@ dependencies:
|
|
41
55
|
segments:
|
42
56
|
- 0
|
43
57
|
version: "0"
|
44
|
-
|
45
|
-
|
58
|
+
requirement: *id003
|
59
|
+
type: :development
|
46
60
|
prerelease: false
|
61
|
+
name: rspec
|
62
|
+
- !ruby/object:Gem::Dependency
|
63
|
+
version_requirements: &id004 !ruby/object:Gem::Requirement
|
64
|
+
none: false
|
65
|
+
requirements:
|
66
|
+
- - ">="
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
hash: 3
|
69
|
+
segments:
|
70
|
+
- 0
|
71
|
+
version: "0"
|
72
|
+
requirement: *id004
|
47
73
|
type: :development
|
74
|
+
prerelease: false
|
75
|
+
name: shoulda
|
48
76
|
- !ruby/object:Gem::Dependency
|
49
|
-
|
77
|
+
version_requirements: &id005 !ruby/object:Gem::Requirement
|
50
78
|
none: false
|
51
79
|
requirements:
|
52
80
|
- - ~>
|
@@ -57,12 +85,12 @@ dependencies:
|
|
57
85
|
- 0
|
58
86
|
- 0
|
59
87
|
version: 1.0.0
|
60
|
-
|
61
|
-
name: bundler
|
62
|
-
prerelease: false
|
88
|
+
requirement: *id005
|
63
89
|
type: :development
|
90
|
+
prerelease: false
|
91
|
+
name: bundler
|
64
92
|
- !ruby/object:Gem::Dependency
|
65
|
-
|
93
|
+
version_requirements: &id006 !ruby/object:Gem::Requirement
|
66
94
|
none: false
|
67
95
|
requirements:
|
68
96
|
- - ~>
|
@@ -73,12 +101,12 @@ dependencies:
|
|
73
101
|
- 6
|
74
102
|
- 4
|
75
103
|
version: 1.6.4
|
76
|
-
|
77
|
-
name: jeweler
|
78
|
-
prerelease: false
|
104
|
+
requirement: *id006
|
79
105
|
type: :development
|
106
|
+
prerelease: false
|
107
|
+
name: jeweler
|
80
108
|
- !ruby/object:Gem::Dependency
|
81
|
-
|
109
|
+
version_requirements: &id007 !ruby/object:Gem::Requirement
|
82
110
|
none: false
|
83
111
|
requirements:
|
84
112
|
- - ">="
|
@@ -87,10 +115,10 @@ dependencies:
|
|
87
115
|
segments:
|
88
116
|
- 0
|
89
117
|
version: "0"
|
90
|
-
|
91
|
-
name: rcov
|
92
|
-
prerelease: false
|
118
|
+
requirement: *id007
|
93
119
|
type: :development
|
120
|
+
prerelease: false
|
121
|
+
name: rcov
|
94
122
|
description: Rawler is a tool that crawls the links of your website
|
95
123
|
email: info@oscardelben.com
|
96
124
|
executables:
|
@@ -153,7 +181,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
153
181
|
requirements: []
|
154
182
|
|
155
183
|
rubyforge_project:
|
156
|
-
rubygems_version: 1.8.
|
184
|
+
rubygems_version: 1.8.10
|
157
185
|
signing_key:
|
158
186
|
specification_version: 3
|
159
187
|
summary: Rawler is a tool that crawls the links of your website
|