rawler 0.1.3 → 0.1.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Gemfile +3 -2
- data/Gemfile.lock +14 -2
- data/VERSION +1 -1
- data/lib/rawler.rb +1 -58
- data/lib/rawler/base.rb +3 -46
- data/lib/rawler/core_extensions.rb +0 -2
- data/lib/rawler/core_extensions/module.rb +5 -7
- data/lib/rawler/crawler.rb +6 -33
- data/lib/rawler/request.rb +6 -14
- data/rawler.gemspec +9 -3
- data/spec/lib/rawler/crawler_spec.rb +19 -0
- data/spec/lib/rawler_spec.rb +21 -0
- metadata +51 -23
data/Gemfile
CHANGED
@@ -7,8 +7,9 @@ source "http://rubygems.org"
|
|
7
7
|
# Include everything needed to run rake, tests, features, etc.
|
8
8
|
gem 'nokogiri'
|
9
9
|
|
10
|
-
|
11
|
-
|
10
|
+
group :development, :test do
|
11
|
+
gem 'fakeweb'
|
12
|
+
gem "rspec"
|
12
13
|
gem "shoulda", ">= 0"
|
13
14
|
gem "bundler", "~> 1.0.0"
|
14
15
|
gem "jeweler", "~> 1.6.4"
|
data/Gemfile.lock
CHANGED
@@ -1,14 +1,24 @@
|
|
1
1
|
GEM
|
2
2
|
remote: http://rubygems.org/
|
3
3
|
specs:
|
4
|
+
diff-lcs (1.1.3)
|
5
|
+
fakeweb (1.3.0)
|
4
6
|
git (1.2.5)
|
5
7
|
jeweler (1.6.4)
|
6
8
|
bundler (~> 1.0)
|
7
9
|
git (>= 1.2.5)
|
8
10
|
rake
|
9
11
|
nokogiri (1.5.0)
|
10
|
-
rake (0.9.2)
|
11
|
-
rcov (0.9.
|
12
|
+
rake (0.9.2.2)
|
13
|
+
rcov (0.9.11)
|
14
|
+
rspec (2.7.0)
|
15
|
+
rspec-core (~> 2.7.0)
|
16
|
+
rspec-expectations (~> 2.7.0)
|
17
|
+
rspec-mocks (~> 2.7.0)
|
18
|
+
rspec-core (2.7.1)
|
19
|
+
rspec-expectations (2.7.0)
|
20
|
+
diff-lcs (~> 1.1.2)
|
21
|
+
rspec-mocks (2.7.0)
|
12
22
|
shoulda (2.11.3)
|
13
23
|
|
14
24
|
PLATFORMS
|
@@ -16,7 +26,9 @@ PLATFORMS
|
|
16
26
|
|
17
27
|
DEPENDENCIES
|
18
28
|
bundler (~> 1.0.0)
|
29
|
+
fakeweb
|
19
30
|
jeweler (~> 1.6.4)
|
20
31
|
nokogiri
|
21
32
|
rcov
|
33
|
+
rspec
|
22
34
|
shoulda
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.1.3
|
1
|
+
0.1.4
|
data/lib/rawler.rb
CHANGED
@@ -1,82 +1,25 @@
|
|
1
|
-
# **Rawler** is a command line tool for finding broken links on your website.
|
2
|
-
# You can install Rawler by running:
|
3
|
-
#
|
4
|
-
# gem install rawler
|
5
|
-
#
|
6
|
-
# To use Rawler type:
|
7
|
-
#
|
8
|
-
# rawler example.com
|
9
|
-
#
|
10
|
-
# Type `rawler -h` to see all the available options (including basic auth support).
|
11
|
-
|
12
|
-
#### Prerequisites
|
13
|
-
|
14
|
-
|
15
1
|
require 'rubygems'
|
16
|
-
|
17
|
-
# We use [net/https](http://www.ruby-doc.org/stdlib/libdoc/net/http/rdoc/index.html) for making requests.
|
18
|
-
|
19
2
|
require 'net/https'
|
20
|
-
|
21
|
-
# We use [nokogiri](http://nokogiri.org/) for parsing web pages.
|
22
|
-
|
23
3
|
require 'nokogiri'
|
24
|
-
|
25
|
-
# We use the [logger](http://www.ruby-doc.org/stdlib/libdoc/logger/rdoc/) utility for handling the output.
|
26
|
-
|
27
4
|
require 'logger'
|
28
|
-
|
29
|
-
# We require [rawler/core_extensions](rawler/core_extensions.html) which includes some core extensions we need.
|
30
|
-
|
31
5
|
require 'rawler/core_extensions'
|
32
|
-
|
33
|
-
#### The Rawler module
|
34
|
-
|
35
|
-
# The Rawler module itself is very simple, and it's only used for storing configuration data like the url that we want to fetch, basic username and password.
|
36
|
-
|
37
6
|
module Rawler
|
38
7
|
VERSION = "#{File.read(File.expand_path(File.dirname(__FILE__)) + '/../VERSION')}"
|
39
|
-
|
40
|
-
# `output` is where we want to direct output. It's set to `$stdout` by default.
|
41
8
|
|
42
9
|
mattr_accessor :output
|
43
|
-
|
44
|
-
# `url` is the url that we want to fetch. We need to keep track of it when parsing other pages to see if they are of the same domain.
|
45
|
-
|
46
10
|
mattr_accessor :url
|
47
|
-
|
48
|
-
# The time we wait between requests, default 3. We don't want to send too many requests to your website!
|
49
|
-
|
50
11
|
mattr_accessor :wait
|
51
|
-
|
52
|
-
# Username and Password for basic auth, if needed.
|
53
|
-
|
54
12
|
mattr_accessor :username, :password
|
55
|
-
|
56
|
-
# Log switch
|
57
|
-
|
58
13
|
mattr_accessor :log
|
59
14
|
|
60
|
-
# Here we autoload when needed the specific namespaces.
|
61
|
-
|
62
|
-
# [Rawler::Base](rawler/base.html) is responsible for validating all the pages in a domain. It's where all the magic happens.
|
63
|
-
|
64
15
|
autoload :Base, "rawler/base"
|
65
|
-
|
66
|
-
# [Rawler::Crawler](rawler/crawler.html) is responsible for parsing links inside a page.
|
67
|
-
|
68
16
|
autoload :Crawler, "rawler/crawler"
|
69
|
-
|
70
|
-
# [Rawler::Request](rawler/reqeust.html) contains some helper methods for performing requests.
|
71
|
-
|
72
17
|
autoload :Request, "rawler/request"
|
73
18
|
|
74
|
-
# We overwrite url= to automatically add `http://` if needed so that you can simply type `rawler example.com` in the command line.
|
75
|
-
|
76
19
|
def self.url=(url)
|
77
20
|
url.strip!
|
78
21
|
|
79
|
-
if (url =~ /http
|
22
|
+
if (url =~ /http[s]?:\/\//) != 0
|
80
23
|
url = 'http://' + url
|
81
24
|
end
|
82
25
|
|
data/lib/rawler/base.rb
CHANGED
@@ -1,18 +1,7 @@
|
|
1
|
-
#### Rawler workflow
|
2
|
-
|
3
|
-
# `Rawler::Base` is where all the heavy work is being made.
|
4
|
-
# When you call `rawler somesite.com`, we create an instance of Rawler::Base and then call `validate`, which recursively validates all the links relative to the domain that we specified.
|
5
|
-
|
6
1
|
module Rawler
|
7
|
-
|
8
2
|
class Base
|
9
|
-
|
10
|
-
# `responses` is used to keep track of which links we have already parsed, so that we wont parse them again and again.
|
11
|
-
# TODO: rename `responses` to something more meaningful.
|
12
3
|
|
13
4
|
attr_accessor :responses
|
14
|
-
|
15
|
-
# When we instantiate `Rawler::Base` we set some options according to what you specified on the command line.
|
16
5
|
|
17
6
|
def initialize(url, output, options={})
|
18
7
|
@responses = {}
|
@@ -26,18 +15,13 @@ module Rawler
|
|
26
15
|
Rawler.log = options[:log]
|
27
16
|
@logfile = File.new("rawler_log.txt", "w") if Rawler.log
|
28
17
|
end
|
29
|
-
|
30
|
-
# The method used to start the real validation process
|
31
18
|
|
32
19
|
def validate
|
33
20
|
validate_links_in_page(Rawler.url)
|
34
21
|
@logfile.close if Rawler.log
|
35
22
|
end
|
36
|
-
|
23
|
+
|
37
24
|
private
|
38
|
-
|
39
|
-
# We ask [Rawler::Crawler](crawler.html) for all the links in page and then validate each of them individually.
|
40
|
-
# We then sleep for the value of `Rawler.wait` (default 3) between each request to avoid dossing your server.
|
41
25
|
|
42
26
|
def validate_links_in_page(page)
|
43
27
|
Rawler::Crawler.new(page).links.each do |page_url|
|
@@ -45,8 +29,6 @@ module Rawler
|
|
45
29
|
sleep(Rawler.wait)
|
46
30
|
end
|
47
31
|
end
|
48
|
-
|
49
|
-
# If we haven't validated the page yet, we check its status code and then validate all the links in the page if it's in the same domain
|
50
32
|
|
51
33
|
def validate_page(page_url, from_url)
|
52
34
|
if not_yet_parsed?(page_url)
|
@@ -54,22 +36,12 @@ module Rawler
|
|
54
36
|
validate_links_in_page(page_url) if same_domain?(page_url)
|
55
37
|
end
|
56
38
|
end
|
57
|
-
|
58
|
-
# This is where we check the specific page status.
|
59
39
|
|
60
40
|
def add_status_code(link, from_url)
|
61
41
|
response = Rawler::Request.get(link)
|
62
42
|
|
63
|
-
# We follow a redirect if necessary.
|
64
|
-
|
65
43
|
validate_page(response['Location'], from_url) if response['Location']
|
66
|
-
|
67
|
-
# We inform the user about what we got.
|
68
|
-
|
69
44
|
record_response(response.code, link, from_url, response['Location'])
|
70
|
-
|
71
|
-
# We add the current page to `responses` to avoid parsing it again/
|
72
|
-
|
73
45
|
responses[link] = { :status => response.code.to_i }
|
74
46
|
rescue Errno::ECONNREFUSED
|
75
47
|
error("Connection refused - #{link} - Called from: #{from_url}")
|
@@ -79,41 +51,28 @@ module Rawler
|
|
79
51
|
rescue Exception
|
80
52
|
error("Unknown error - #{link} - Called from: #{from_url}")
|
81
53
|
end
|
82
|
-
|
83
|
-
# Some helper methods
|
84
54
|
|
85
55
|
def same_domain?(link)
|
86
56
|
URI.parse(Rawler.url).host == URI.parse(link).host
|
87
57
|
end
|
88
|
-
|
58
|
+
|
89
59
|
def not_yet_parsed?(link)
|
90
60
|
responses[link].nil?
|
91
61
|
end
|
92
|
-
|
62
|
+
|
93
63
|
def error(message)
|
94
64
|
Rawler.output.error(message)
|
95
65
|
end
|
96
66
|
|
97
|
-
# We use this method to inform the user of a page status
|
98
|
-
|
99
67
|
def record_response(code, link, from_url, redirection=nil)
|
100
|
-
|
101
|
-
# By default, we just give the status code and the page url
|
102
|
-
|
103
68
|
message = "#{code} - #{link}"
|
104
69
|
|
105
|
-
# If the status code is more or equal than 300, we also add which url linked the current page
|
106
|
-
|
107
70
|
if code.to_i >= 300
|
108
71
|
message += " - Called from: #{from_url}"
|
109
72
|
end
|
110
73
|
|
111
|
-
# We add information about redirects, if a redirect was set
|
112
|
-
|
113
74
|
message += " - Following redirection to: #{redirection}" if redirection
|
114
75
|
|
115
|
-
# Depending on the status code, we use a different method of logger.
|
116
|
-
|
117
76
|
code = code.to_i
|
118
77
|
case code / 100
|
119
78
|
when 1,2
|
@@ -127,7 +86,5 @@ module Rawler
|
|
127
86
|
end
|
128
87
|
@logfile.puts(message) if Rawler.log
|
129
88
|
end
|
130
|
-
|
131
89
|
end
|
132
|
-
|
133
90
|
end
|
@@ -1,5 +1,3 @@
|
|
1
|
-
# Add `attr_accessor` like methods to modules
|
2
|
-
|
3
1
|
class Module
|
4
2
|
def mattr_reader(*syms)
|
5
3
|
syms.each do |sym|
|
@@ -8,7 +6,7 @@ class Module
|
|
8
6
|
unless defined? @@#{sym}
|
9
7
|
@@#{sym} = nil
|
10
8
|
end
|
11
|
-
|
9
|
+
|
12
10
|
def self.#{sym}
|
13
11
|
@@#{sym}
|
14
12
|
end
|
@@ -19,18 +17,18 @@ class Module
|
|
19
17
|
EOS
|
20
18
|
end
|
21
19
|
end
|
22
|
-
|
20
|
+
|
23
21
|
def mattr_writer(*syms)
|
24
22
|
syms.each do |sym|
|
25
23
|
class_eval(<<-EOS, __FILE__, __LINE__)
|
26
24
|
unless defined? @@#{sym}
|
27
25
|
@@#{sym} = nil
|
28
26
|
end
|
29
|
-
|
27
|
+
|
30
28
|
def self.#{sym}=(obj)
|
31
29
|
@@#{sym} = obj
|
32
30
|
end
|
33
|
-
|
31
|
+
|
34
32
|
#{"
|
35
33
|
def #{sym}=(obj)
|
36
34
|
@@#{sym} = obj
|
@@ -39,7 +37,7 @@ class Module
|
|
39
37
|
EOS
|
40
38
|
end
|
41
39
|
end
|
42
|
-
|
40
|
+
|
43
41
|
def mattr_accessor(*syms)
|
44
42
|
mattr_reader(*syms)
|
45
43
|
mattr_writer(*syms)
|
data/lib/rawler/crawler.rb
CHANGED
@@ -1,40 +1,23 @@
|
|
1
|
-
# `Rawler::Crawler` is responsible for parsing links inside a page
|
2
|
-
|
3
1
|
module Rawler
|
4
|
-
|
5
2
|
class Crawler
|
6
|
-
|
7
|
-
# An instance of Rawler::Crawler has a url which represents the url for which we want to parse links.
|
8
3
|
|
9
4
|
attr_accessor :url
|
10
5
|
|
11
|
-
|
12
|
-
|
13
|
-
SKIP_FORMATS = /^(javascript|mailto)/
|
14
|
-
|
15
|
-
# To use this class, just pass it a url
|
6
|
+
SKIP_FORMATS = /^(javascript|mailto|callto)/
|
16
7
|
|
17
8
|
def initialize(url)
|
18
9
|
@url = url.strip
|
19
10
|
end
|
20
11
|
|
21
|
-
# And then call `links` to get its links.
|
22
|
-
|
23
12
|
def links
|
24
|
-
# If the url is different than the main Rawler.url, or if the page is not html, we return an empty array
|
25
13
|
if different_domain?(url, Rawler.url) || not_html?(url)
|
26
14
|
return []
|
27
15
|
end
|
28
|
-
|
29
|
-
# Otherwise we fetch the page
|
30
16
|
|
31
17
|
response = Rawler::Request.get(url)
|
32
18
|
|
33
|
-
# And kindly ask nokogiri to convert it for us
|
34
|
-
|
35
19
|
doc = Nokogiri::HTML(response.body)
|
36
20
|
|
37
|
-
# We then do some magic, search all the links in the document that contain a valid link, and return them.
|
38
21
|
doc.css('a').map { |a| a['href'] }.select { |url| !url.nil? }.map { |url| absolute_url(url) }.select { |url| valid_url?(url) }
|
39
22
|
rescue Errno::ECONNREFUSED
|
40
23
|
write("Couldn't connect to #{url}")
|
@@ -43,47 +26,39 @@ module Rawler
|
|
43
26
|
write("Connection to #{url} timed out")
|
44
27
|
[]
|
45
28
|
end
|
46
|
-
|
29
|
+
|
47
30
|
private
|
48
|
-
|
49
|
-
# Here's how we transform a relative url to an absolute url
|
50
31
|
|
51
32
|
def absolute_url(path)
|
52
|
-
# First, encode the url
|
53
33
|
path = URI.encode(path.strip, Regexp.new("[^#{URI::PATTERN::UNRESERVED}#{URI::PATTERN::RESERVED}#]"))
|
54
34
|
|
55
|
-
# if the url contains a scheme that means it's already absolute
|
56
35
|
if URI.parse(path).scheme
|
57
36
|
path
|
58
37
|
else
|
59
|
-
# Otherwise we merge `url` to get the absolute url
|
60
38
|
URI.parse(url).merge(path).to_s
|
61
39
|
end
|
62
40
|
rescue URI::InvalidURIError
|
63
41
|
write("Invalid url: #{path} - Called from: #{url}")
|
64
42
|
nil
|
65
43
|
end
|
66
|
-
|
67
|
-
# Some helper methods
|
68
44
|
|
69
45
|
def write(message)
|
70
46
|
Rawler.output.error(message)
|
71
47
|
end
|
72
|
-
|
48
|
+
|
73
49
|
def different_domain?(url_1, url_2)
|
74
50
|
URI.parse(url_1).host != URI.parse(url_2).host
|
75
51
|
end
|
76
|
-
|
52
|
+
|
77
53
|
def not_html?(url)
|
78
54
|
Rawler::Request.head(url).content_type != 'text/html'
|
79
55
|
end
|
80
|
-
|
56
|
+
|
81
57
|
def valid_url?(url)
|
82
58
|
return false unless url
|
83
|
-
|
84
59
|
url.strip!
|
60
|
+
|
85
61
|
scheme = URI.parse(url).scheme
|
86
|
-
|
87
62
|
if ['http', 'https'].include?(scheme)
|
88
63
|
true
|
89
64
|
else
|
@@ -95,7 +70,5 @@ module Rawler
|
|
95
70
|
false
|
96
71
|
write("Invalid url - #{url}")
|
97
72
|
end
|
98
|
-
|
99
73
|
end
|
100
|
-
|
101
74
|
end
|
data/lib/rawler/request.rb
CHANGED
@@ -1,25 +1,20 @@
|
|
1
|
-
# `Rawler::Request` contains some abstraction for making web requests, like automatically adding ssl and basic auth.
|
2
|
-
|
3
1
|
module Rawler
|
4
|
-
|
5
2
|
class Request
|
6
|
-
|
7
3
|
class << self
|
8
|
-
|
4
|
+
|
9
5
|
def get(url)
|
10
6
|
perform_request(:get, url)
|
11
7
|
end
|
12
|
-
|
8
|
+
|
13
9
|
def head(url)
|
14
10
|
perform_request(:head, url)
|
15
11
|
end
|
16
|
-
|
12
|
+
|
17
13
|
private
|
18
|
-
|
14
|
+
|
19
15
|
def perform_request(method, url)
|
20
16
|
uri = URI.parse(url)
|
21
17
|
|
22
|
-
# Use http_proxy if set
|
23
18
|
proxy = URI.parse(ENV['http_proxy']) if ENV['http_proxy'] rescue nil
|
24
19
|
if proxy
|
25
20
|
http = Net::HTTP::Proxy(proxy.host, proxy.port).new(uri.host, uri.port)
|
@@ -28,16 +23,13 @@ module Rawler
|
|
28
23
|
end
|
29
24
|
http.use_ssl = (uri.scheme == 'https')
|
30
25
|
http.verify_mode = OpenSSL::SSL::VERIFY_NONE
|
31
|
-
|
26
|
+
|
32
27
|
path = (uri.path.size == 0) ? "/" : uri.path
|
33
|
-
|
28
|
+
|
34
29
|
request = Net::HTTP::Get.new(path)
|
35
30
|
request.basic_auth(Rawler.username, Rawler.password)
|
36
31
|
http.request(request)
|
37
32
|
end
|
38
|
-
|
39
33
|
end
|
40
|
-
|
41
34
|
end
|
42
|
-
|
43
35
|
end
|
data/rawler.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = "rawler"
|
8
|
-
s.version = "0.1.3"
|
8
|
+
s.version = "0.1.4"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Oscar Del Ben"]
|
12
|
-
s.date = "
|
12
|
+
s.date = "2012-02-27"
|
13
13
|
s.description = "Rawler is a tool that crawls the links of your website"
|
14
14
|
s.email = "info@oscardelben.com"
|
15
15
|
s.executables = ["rawler"]
|
@@ -45,7 +45,7 @@ Gem::Specification.new do |s|
|
|
45
45
|
s.homepage = "http://github.com/oscardelben/rawler"
|
46
46
|
s.licenses = ["MIT"]
|
47
47
|
s.require_paths = ["lib"]
|
48
|
-
s.rubygems_version = "1.8.
|
48
|
+
s.rubygems_version = "1.8.10"
|
49
49
|
s.summary = "Rawler is a tool that crawls the links of your website"
|
50
50
|
|
51
51
|
if s.respond_to? :specification_version then
|
@@ -53,12 +53,16 @@ Gem::Specification.new do |s|
|
|
53
53
|
|
54
54
|
if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
|
55
55
|
s.add_runtime_dependency(%q<nokogiri>, [">= 0"])
|
56
|
+
s.add_development_dependency(%q<fakeweb>, [">= 0"])
|
57
|
+
s.add_development_dependency(%q<rspec>, [">= 0"])
|
56
58
|
s.add_development_dependency(%q<shoulda>, [">= 0"])
|
57
59
|
s.add_development_dependency(%q<bundler>, ["~> 1.0.0"])
|
58
60
|
s.add_development_dependency(%q<jeweler>, ["~> 1.6.4"])
|
59
61
|
s.add_development_dependency(%q<rcov>, [">= 0"])
|
60
62
|
else
|
61
63
|
s.add_dependency(%q<nokogiri>, [">= 0"])
|
64
|
+
s.add_dependency(%q<fakeweb>, [">= 0"])
|
65
|
+
s.add_dependency(%q<rspec>, [">= 0"])
|
62
66
|
s.add_dependency(%q<shoulda>, [">= 0"])
|
63
67
|
s.add_dependency(%q<bundler>, ["~> 1.0.0"])
|
64
68
|
s.add_dependency(%q<jeweler>, ["~> 1.6.4"])
|
@@ -66,6 +70,8 @@ Gem::Specification.new do |s|
|
|
66
70
|
end
|
67
71
|
else
|
68
72
|
s.add_dependency(%q<nokogiri>, [">= 0"])
|
73
|
+
s.add_dependency(%q<fakeweb>, [">= 0"])
|
74
|
+
s.add_dependency(%q<rspec>, [">= 0"])
|
69
75
|
s.add_dependency(%q<shoulda>, [">= 0"])
|
70
76
|
s.add_dependency(%q<bundler>, ["~> 1.0.0"])
|
71
77
|
s.add_dependency(%q<jeweler>, ["~> 1.6.4"])
|
@@ -159,6 +159,25 @@ describe Rawler::Crawler do
|
|
159
159
|
crawler.links
|
160
160
|
end
|
161
161
|
end
|
162
|
+
|
163
|
+
context "callto" do
|
164
|
+
let(:url) { 'http://example.com/path' }
|
165
|
+
let(:crawler) { Rawler::Crawler.new(url) }
|
166
|
+
let(:content) { "<a href=\"callto:home22\">foo</a><a name=\"foo\">" }
|
167
|
+
|
168
|
+
before(:each) do
|
169
|
+
register(url, content)
|
170
|
+
end
|
171
|
+
|
172
|
+
it "should return empty links" do
|
173
|
+
crawler.links.should == []
|
174
|
+
end
|
175
|
+
|
176
|
+
it "should not report the error" do
|
177
|
+
crawler.should_not_receive(:write)
|
178
|
+
crawler.links
|
179
|
+
end
|
180
|
+
end
|
162
181
|
|
163
182
|
end
|
164
183
|
|
data/spec/lib/rawler_spec.rb
CHANGED
@@ -20,6 +20,27 @@ describe Rawler::Base do
|
|
20
20
|
Rawler::Base.new(original, output)
|
21
21
|
Rawler.url.should == expected
|
22
22
|
end
|
23
|
+
|
24
|
+
it "should auto prepend http" do
|
25
|
+
original = 'example.com'
|
26
|
+
expected = 'http://example.com'
|
27
|
+
Rawler::Base.new(original, output)
|
28
|
+
Rawler.url.should == expected
|
29
|
+
end
|
30
|
+
|
31
|
+
it "should not auto prepend http when already http" do
|
32
|
+
original = 'http://example.com'
|
33
|
+
expected = 'http://example.com'
|
34
|
+
Rawler::Base.new(original, output)
|
35
|
+
Rawler.url.should == expected
|
36
|
+
end
|
37
|
+
|
38
|
+
it "should not auto prepend http when https" do
|
39
|
+
original = 'https://example.com'
|
40
|
+
expected = 'https://example.com'
|
41
|
+
Rawler::Base.new(original, output)
|
42
|
+
Rawler.url.should == expected
|
43
|
+
end
|
23
44
|
end
|
24
45
|
|
25
46
|
describe "validate_links" do
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rawler
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 19
|
5
5
|
prerelease:
|
6
6
|
segments:
|
7
7
|
- 0
|
8
8
|
- 1
|
9
|
-
- 3
|
10
|
-
version: 0.1.3
|
9
|
+
- 4
|
10
|
+
version: 0.1.4
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Oscar Del Ben
|
@@ -15,10 +15,10 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date:
|
18
|
+
date: 2012-02-27 00:00:00 Z
|
19
19
|
dependencies:
|
20
20
|
- !ruby/object:Gem::Dependency
|
21
|
-
|
21
|
+
version_requirements: &id001 !ruby/object:Gem::Requirement
|
22
22
|
none: false
|
23
23
|
requirements:
|
24
24
|
- - ">="
|
@@ -27,12 +27,26 @@ dependencies:
|
|
27
27
|
segments:
|
28
28
|
- 0
|
29
29
|
version: "0"
|
30
|
-
|
30
|
+
requirement: *id001
|
31
|
+
type: :runtime
|
32
|
+
prerelease: false
|
31
33
|
name: nokogiri
|
34
|
+
- !ruby/object:Gem::Dependency
|
35
|
+
version_requirements: &id002 !ruby/object:Gem::Requirement
|
36
|
+
none: false
|
37
|
+
requirements:
|
38
|
+
- - ">="
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
hash: 3
|
41
|
+
segments:
|
42
|
+
- 0
|
43
|
+
version: "0"
|
44
|
+
requirement: *id002
|
45
|
+
type: :development
|
32
46
|
prerelease: false
|
33
|
-
|
47
|
+
name: fakeweb
|
34
48
|
- !ruby/object:Gem::Dependency
|
35
|
-
|
49
|
+
version_requirements: &id003 !ruby/object:Gem::Requirement
|
36
50
|
none: false
|
37
51
|
requirements:
|
38
52
|
- - ">="
|
@@ -41,12 +55,26 @@ dependencies:
|
|
41
55
|
segments:
|
42
56
|
- 0
|
43
57
|
version: "0"
|
44
|
-
|
45
|
-
|
58
|
+
requirement: *id003
|
59
|
+
type: :development
|
46
60
|
prerelease: false
|
61
|
+
name: rspec
|
62
|
+
- !ruby/object:Gem::Dependency
|
63
|
+
version_requirements: &id004 !ruby/object:Gem::Requirement
|
64
|
+
none: false
|
65
|
+
requirements:
|
66
|
+
- - ">="
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
hash: 3
|
69
|
+
segments:
|
70
|
+
- 0
|
71
|
+
version: "0"
|
72
|
+
requirement: *id004
|
47
73
|
type: :development
|
74
|
+
prerelease: false
|
75
|
+
name: shoulda
|
48
76
|
- !ruby/object:Gem::Dependency
|
49
|
-
|
77
|
+
version_requirements: &id005 !ruby/object:Gem::Requirement
|
50
78
|
none: false
|
51
79
|
requirements:
|
52
80
|
- - ~>
|
@@ -57,12 +85,12 @@ dependencies:
|
|
57
85
|
- 0
|
58
86
|
- 0
|
59
87
|
version: 1.0.0
|
60
|
-
|
61
|
-
name: bundler
|
62
|
-
prerelease: false
|
88
|
+
requirement: *id005
|
63
89
|
type: :development
|
90
|
+
prerelease: false
|
91
|
+
name: bundler
|
64
92
|
- !ruby/object:Gem::Dependency
|
65
|
-
|
93
|
+
version_requirements: &id006 !ruby/object:Gem::Requirement
|
66
94
|
none: false
|
67
95
|
requirements:
|
68
96
|
- - ~>
|
@@ -73,12 +101,12 @@ dependencies:
|
|
73
101
|
- 6
|
74
102
|
- 4
|
75
103
|
version: 1.6.4
|
76
|
-
|
77
|
-
name: jeweler
|
78
|
-
prerelease: false
|
104
|
+
requirement: *id006
|
79
105
|
type: :development
|
106
|
+
prerelease: false
|
107
|
+
name: jeweler
|
80
108
|
- !ruby/object:Gem::Dependency
|
81
|
-
|
109
|
+
version_requirements: &id007 !ruby/object:Gem::Requirement
|
82
110
|
none: false
|
83
111
|
requirements:
|
84
112
|
- - ">="
|
@@ -87,10 +115,10 @@ dependencies:
|
|
87
115
|
segments:
|
88
116
|
- 0
|
89
117
|
version: "0"
|
90
|
-
|
91
|
-
name: rcov
|
92
|
-
prerelease: false
|
118
|
+
requirement: *id007
|
93
119
|
type: :development
|
120
|
+
prerelease: false
|
121
|
+
name: rcov
|
94
122
|
description: Rawler is a tool that crawls the links of your website
|
95
123
|
email: info@oscardelben.com
|
96
124
|
executables:
|
@@ -153,7 +181,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
153
181
|
requirements: []
|
154
182
|
|
155
183
|
rubyforge_project:
|
156
|
-
rubygems_version: 1.8.
|
184
|
+
rubygems_version: 1.8.10
|
157
185
|
signing_key:
|
158
186
|
specification_version: 3
|
159
187
|
summary: Rawler is a tool that crawls the links of your website
|