site_mapper 0.0.10 → 0.0.12
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/site_mapper.rb +12 -3
- data/lib/site_mapper/crawl_url.rb +18 -53
- data/lib/site_mapper/crawler.rb +42 -15
- data/lib/site_mapper/logger.rb +11 -0
- data/lib/site_mapper/request.rb +28 -27
- data/lib/site_mapper/robots.rb +14 -16
- data/lib/site_mapper/version.rb +1 -1
- metadata +22 -22
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: dc4b21c14dc15f1cc6df4b6406d12acf5cb821d9
|
4
|
+
data.tar.gz: 34ef0ab2fcd0a74bbcdd53d9e47681d6440f951d
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 1d4da1f2753dfb5f06ea577c02183efbf4cb919b783ce128f07b46f29b6af7a330cc01d839895ea4d4fb53cf68db2a6b5adccba47530a0278aaca9bfe1fa4c02
|
7
|
+
data.tar.gz: c32dde9478240d63b63d6e521e04f3c914322b544c79eee7ee25e1b2ea46b5ad2529ddc7d778d30fe2c10cde074f88a72c53458a814fc5d2ab74bc87288e63ed
|
data/lib/site_mapper.rb
CHANGED
@@ -20,10 +20,19 @@ module SiteMapper
|
|
20
20
|
# @param [String] link to domain
|
21
21
|
# @param [Hash] options hash
|
22
22
|
# @example Collect all URLs from example.com
|
23
|
-
#
|
23
|
+
# SiteMapper.map('example.com')
|
24
|
+
# @example Collect all URLs from example.com with custom User-agent
|
25
|
+
# SiteMapper.map('example.com', user_agent: 'MyUserAgent')
|
26
|
+
# @example Collect all URLs from example.com with custom logger class
|
27
|
+
# class MyLogger
|
28
|
+
# def self.log(msg); puts msg;end
|
29
|
+
# def self.err_log(msg); puts msg;end
|
30
|
+
# end
|
31
|
+
# SiteMapper.map('example.com', logger: MyLogger)
|
24
32
|
def self.map(link, options = {})
|
25
|
-
set_logger(options
|
26
|
-
|
33
|
+
set_logger(options.delete(:logger))
|
34
|
+
options = { user_agent: USER_AGENT }.merge(options)
|
35
|
+
Crawler.collect_urls(link, options) { |url| yield(url) if block_given? }
|
27
36
|
end
|
28
37
|
|
29
38
|
# Set logger.
|
@@ -1,85 +1,50 @@
|
|
1
1
|
module SiteMapper
|
2
2
|
# Crawl URL formatter.
|
3
3
|
class CrawlUrl
|
4
|
-
attr_reader :resolved_base_url
|
4
|
+
attr_reader :resolved_base_url
|
5
5
|
|
6
|
+
# Too many request error message
|
7
|
+
TOO_MANY_REQUEST_MSG = "You're being challenged with a 'too many requests' captcha"
|
8
|
+
|
9
|
+
# Initialize CrawlUrl
|
6
10
|
# @param [String] base_url
|
11
|
+
# @example Initialize CrawlUrl with example.com as base_url
|
12
|
+
# CrawlUrl.new('example.com')
|
7
13
|
def initialize(base_url)
|
8
|
-
@resolved_base_url = Request.resolve_url(base_url
|
14
|
+
@resolved_base_url = Request.resolve_url(base_url) # "#{protocol}#{host}"
|
9
15
|
@base_hostname = URI.parse(@resolved_base_url).hostname
|
10
|
-
@resolved_base_url.prepend('http://') unless @resolved_base_url.start_with?('http')
|
11
16
|
end
|
12
17
|
|
13
18
|
# Given a link it constructs the absolute path,
|
14
19
|
# if valid URL & URL has same domain as @resolved_base_url.
|
15
|
-
# @param [String]
|
16
|
-
# @param [String]
|
20
|
+
# @param [String] page_url url found on page
|
21
|
+
# @param [String] current_url current page url
|
17
22
|
# @return [String] with absolute path to resource
|
18
23
|
# @example Construct absolute URL for '/path', example.com
|
19
24
|
# cu = CrawlUrl.new('example.com')
|
20
25
|
# cu.absolute_url_from('/path', 'example.com/some/path')
|
21
26
|
# # => http://example.com/some/path
|
22
|
-
def absolute_url_from(
|
23
|
-
return
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
elsif same_domain?(raw_url, @resolved_base_url)
|
28
|
-
raw_url
|
29
|
-
else
|
30
|
-
nil
|
31
|
-
end
|
27
|
+
def absolute_url_from(page_url, current_url)
|
28
|
+
return unless eligible_url?(page_url)
|
29
|
+
parsed_uri = URI.join(current_url, page_url) rescue return
|
30
|
+
return unless parsed_uri.hostname == @base_hostname
|
31
|
+
parsed_uri.to_s
|
32
32
|
end
|
33
33
|
|
34
34
|
private
|
35
35
|
|
36
|
-
def url_from_relative(url, current_page_url)
|
37
|
-
if url.start_with?('/')
|
38
|
-
"#{without_path_suffix(resolved_base_url)}#{url}"
|
39
|
-
elsif url.start_with?('../')
|
40
|
-
"#{url_from_dotted_url(url, current_page_url)}"
|
41
|
-
else
|
42
|
-
"#{with_path_suffix(resolved_base_url)}#{url}"
|
43
|
-
end
|
44
|
-
end
|
45
|
-
|
46
|
-
def url_from_dotted_url(url, current_page_url)
|
47
|
-
absolute_url = with_path_suffix(current_page_url.dup)
|
48
|
-
found_dots = without_path_suffix(url).scan('../').length
|
49
|
-
removed_dots = 0
|
50
|
-
max_levels = 4
|
51
|
-
while found_dots >= removed_dots && max_levels > removed_dots
|
52
|
-
index = absolute_url.rindex('/') or break
|
53
|
-
absolute_url = absolute_url[0..(index - 1)]
|
54
|
-
removed_dots += 1
|
55
|
-
end
|
56
|
-
"#{with_path_suffix(absolute_url)}#{url.gsub('../', '')}"
|
57
|
-
end
|
58
|
-
|
59
|
-
def with_path_suffix(passed_url)
|
60
|
-
url = passed_url.dup
|
61
|
-
url.end_with?('/') ? url : url << '/'
|
62
|
-
end
|
63
|
-
|
64
|
-
def without_path_suffix(passed_url)
|
65
|
-
url = passed_url.dup
|
66
|
-
url.end_with?('/') ? url[0...(url.length - 1)] : url
|
67
|
-
end
|
68
|
-
|
69
36
|
def eligible_url?(href)
|
70
37
|
return false if href.nil? || href.empty?
|
71
38
|
dont_start = %w(javascript: callto: mailto: tel: skype: facetime: wtai: #)
|
72
39
|
dont_include = %w(/email-protection#)
|
73
|
-
|
40
|
+
err_include = %w(/sorry/IndexRedirect?)
|
41
|
+
dont_end = %w(.zip .rar .json .pdf .exe .dmg .pkg .dpkg .bat)
|
74
42
|
|
43
|
+
err_include.each { |pattern| fail TOO_MANY_REQUEST_MSG if href.include?(pattern) }
|
75
44
|
dont_start.each { |pattern| return false if href.start_with?(pattern) }
|
76
45
|
dont_include.each { |pattern| return false if href.include?(pattern) }
|
77
46
|
dont_end.each { |pattern| return false if href.end_with?(pattern) }
|
78
47
|
true
|
79
48
|
end
|
80
|
-
|
81
|
-
def same_domain?(first, second)
|
82
|
-
first.include?(second)
|
83
|
-
end
|
84
49
|
end
|
85
50
|
end
|
data/lib/site_mapper/crawler.rb
CHANGED
@@ -4,20 +4,39 @@ require 'nokogiri'
|
|
4
4
|
module SiteMapper
|
5
5
|
# Crawls a given site.
|
6
6
|
class Crawler
|
7
|
+
# Default options
|
8
|
+
OPTIONS = {
|
9
|
+
resolve: false,
|
10
|
+
sleep_length: 0.5,
|
11
|
+
max_requests: Float::INFINITY
|
12
|
+
}
|
13
|
+
|
7
14
|
# @param [String] url base url for crawler
|
8
15
|
# @param [Hash] options hash, resolve key (optional false by default)
|
16
|
+
# add user_agent key to specify custom User-agent
|
17
|
+
# @example Create crawler with custom User-agent
|
18
|
+
# Crawler.new('example.com', user_agent: 'MyUserAgent')
|
19
|
+
# @example Create crawler and resolve all urls
|
20
|
+
# Crawler.new('example.com', resolve: true)
|
21
|
+
# @example Create crawler and sleep 1 second between each request
|
22
|
+
# Crawler.new('example.com', sleep_length: 1)
|
23
|
+
# @example Create crawler and perform max 3 requests
|
24
|
+
# Crawler.new('example.com', max_requests: 3)
|
9
25
|
def initialize(url, options = {})
|
10
26
|
@base_url = Request.resolve_url(url)
|
11
|
-
@options =
|
27
|
+
@options = OPTIONS.dup.merge(options)
|
28
|
+
@user_agent = @options.fetch(:user_agent)
|
12
29
|
@crawl_url = CrawlUrl.new(@base_url)
|
13
30
|
@fetch_queue = CrawlQueue.new
|
14
31
|
@processed = Set.new
|
15
32
|
@robots = nil
|
16
33
|
end
|
17
34
|
|
35
|
+
# See documentation for the instance variant of this method.
|
36
|
+
# @return [Array] with links.
|
18
37
|
# @see #collect_urls
|
19
|
-
def self.collect_urls(
|
20
|
-
new(
|
38
|
+
def self.collect_urls(*args)
|
39
|
+
new(*args).collect_urls { |url| yield(url) }
|
21
40
|
end
|
22
41
|
|
23
42
|
# Collects all links on domain for domain.
|
@@ -32,13 +51,16 @@ module SiteMapper
|
|
32
51
|
# end
|
33
52
|
def collect_urls
|
34
53
|
@fetch_queue << @crawl_url.resolved_base_url
|
35
|
-
until @fetch_queue.empty?
|
54
|
+
until @fetch_queue.empty? || @processed.length >= @options[:max_requests]
|
36
55
|
url = @fetch_queue.pop
|
37
56
|
yield(url)
|
38
|
-
|
57
|
+
page_urls_for(url)
|
39
58
|
end
|
40
|
-
|
41
|
-
|
59
|
+
result = @processed + @fetch_queue
|
60
|
+
Logger.log "Crawling finished:"
|
61
|
+
Logger.log "Processed links: #{@processed.length}"
|
62
|
+
Logger.log "Found links: #{result.length}"
|
63
|
+
result.to_a
|
42
64
|
rescue Interrupt, IRB::Abort
|
43
65
|
Logger.err_log 'Crawl interrupted.'
|
44
66
|
@fetch_queue.to_a
|
@@ -46,12 +68,13 @@ module SiteMapper
|
|
46
68
|
|
47
69
|
private
|
48
70
|
|
49
|
-
def
|
50
|
-
Logger.log "Queue length: #{@fetch_queue.length}, Parsing: #{
|
51
|
-
link_elements = Request.
|
52
|
-
|
71
|
+
def page_urls_for(current_url)
|
72
|
+
Logger.log "Queue length: #{@fetch_queue.length}, Parsing: #{current_url}"
|
73
|
+
link_elements = Request.document(current_url, user_agent: @options[:user_agent]).css('a')
|
74
|
+
wait
|
75
|
+
@processed << current_url
|
53
76
|
link_elements.each do |page_link|
|
54
|
-
url = @crawl_url.absolute_url_from(page_link.attr('href'),
|
77
|
+
url = @crawl_url.absolute_url_from(page_link.attr('href'), current_url)
|
55
78
|
@fetch_queue << url if url && eligible_for_queue?(resolve(url))
|
56
79
|
end
|
57
80
|
end
|
@@ -62,7 +85,7 @@ module SiteMapper
|
|
62
85
|
|
63
86
|
def robots
|
64
87
|
return @robots unless @robots.nil?
|
65
|
-
robots_body = Request.
|
88
|
+
robots_body = Request.response_body("#{@base_url}/robots.txt", user_agent: @options[:user_agent])
|
66
89
|
@robots = Robots.new(robots_body, URI.parse(@base_url).host, SiteMapper::USER_AGENT)
|
67
90
|
@robots
|
68
91
|
end
|
@@ -71,13 +94,17 @@ module SiteMapper
|
|
71
94
|
@options[:resolve] ? Request.resolve_url(url) : url
|
72
95
|
end
|
73
96
|
|
97
|
+
def wait
|
98
|
+
sleep @options[:sleep_length]
|
99
|
+
end
|
100
|
+
|
74
101
|
# Queue of urls to be crawled.
|
75
102
|
class CrawlQueue
|
76
103
|
# @return [Set] that extends EnumerablePop module
|
77
104
|
def self.new
|
78
105
|
Set.new.extend(EnumerablePop)
|
79
106
|
end
|
80
|
-
|
107
|
+
|
81
108
|
# Add pop method when added to class.
|
82
109
|
# The class that extends this module need to implement #first and #delete.
|
83
110
|
module EnumerablePop
|
@@ -90,5 +117,5 @@ module SiteMapper
|
|
90
117
|
end
|
91
118
|
end
|
92
119
|
end
|
93
|
-
end
|
120
|
+
end
|
94
121
|
end
|
data/lib/site_mapper/logger.rb
CHANGED
@@ -1,6 +1,9 @@
|
|
1
1
|
module SiteMapper
|
2
2
|
# Handles logging
|
3
3
|
class Logger
|
4
|
+
|
5
|
+
# Choose what logger to use by type.
|
6
|
+
# @return [Object] returns the appropriate logger.
|
4
7
|
# @param [Symbol] type of logger class to be used
|
5
8
|
def self.use_logger_type(type)
|
6
9
|
fail 'Logger already set' if defined?(@@log)
|
@@ -15,18 +18,22 @@ module SiteMapper
|
|
15
18
|
@@log
|
16
19
|
end
|
17
20
|
|
21
|
+
# Choose what logger to use.
|
22
|
+
# @return [Object] returns logger.
|
18
23
|
# @param [Class, #log, #err_log] logger a logger class
|
19
24
|
def self.use_logger(logger)
|
20
25
|
fail 'Logger already set' if defined?(@@log)
|
21
26
|
@@log = logger
|
22
27
|
end
|
23
28
|
|
29
|
+
# Send a message to the logger
|
24
30
|
# @param [String] msg to be logged
|
25
31
|
def self.log(msg)
|
26
32
|
@@log ||= use_logger_type(:default)
|
27
33
|
@@log.log(msg)
|
28
34
|
end
|
29
35
|
|
36
|
+
# Send an error message to the logger
|
30
37
|
# @param [String] err_msg to be logged
|
31
38
|
def self.err_log(err_msg)
|
32
39
|
@@log ||= use_logger_type(:default)
|
@@ -35,11 +42,13 @@ module SiteMapper
|
|
35
42
|
|
36
43
|
# Log to terminal.
|
37
44
|
module SystemOutLogger
|
45
|
+
# Log to STDOUT
|
38
46
|
# @param [String] msg to be logged to STDOUT
|
39
47
|
def self.log(msg)
|
40
48
|
STDOUT.puts(msg)
|
41
49
|
end
|
42
50
|
|
51
|
+
# Log to STDERR
|
43
52
|
# @param [String] msg to be logged to STDERR
|
44
53
|
def self.err_log(msg)
|
45
54
|
STDERR.puts("[ERROR] #{msg}")
|
@@ -48,8 +57,10 @@ module SiteMapper
|
|
48
57
|
|
49
58
|
# Don't log
|
50
59
|
module NilLogger
|
60
|
+
# Don't log
|
51
61
|
# @param [String] msg to be ignored
|
52
62
|
def self.log(msg);end
|
63
|
+
# Don't error log
|
53
64
|
# @param [String] msg to be ignored
|
54
65
|
def self.err_log(msg);end
|
55
66
|
end
|
data/lib/site_mapper/request.rb
CHANGED
@@ -11,61 +11,62 @@ module SiteMapper
|
|
11
11
|
class << self
|
12
12
|
# Given an URL get it then parse it with Nokogiri::HTML.
|
13
13
|
# @param [String] url
|
14
|
+
# @param [Hash] options
|
14
15
|
# @return [Nokogiri::HTML] a nokogiri HTML object
|
15
|
-
def
|
16
|
-
Nokogiri::HTML(Request.
|
16
|
+
def document(url, options = {})
|
17
|
+
Nokogiri::HTML(Request.response_body(url, options))
|
17
18
|
end
|
18
19
|
|
19
20
|
# Given an URL get the response.
|
20
21
|
# @param [String] url
|
21
|
-
# @param [
|
22
|
+
# @param [Hash] options
|
22
23
|
# @return [Net::HTTPOK] if response is successful, raises error otherwise
|
23
24
|
# @example get example.com and resolve the URL
|
24
|
-
# Request.
|
25
|
+
# Request.response('example.com', resolve: true)
|
25
26
|
# @example get example.com and do *not* resolve the URL
|
26
|
-
# Request.
|
27
|
-
#
|
28
|
-
|
29
|
-
|
27
|
+
# Request.response('http://example.com')
|
28
|
+
# @example get example.com and resolve the URL
|
29
|
+
# Request.response('http://example.com', resolve: true)
|
30
|
+
# @example get example.com and resolve the URL and use a custom User-Agent
|
31
|
+
# Request.response('http://example.com', resolve: true, user_agent: 'MyUserAgent')
|
32
|
+
def response(url, options = {})
|
33
|
+
options = {
|
34
|
+
resolve: false,
|
35
|
+
user_agent: SiteMapper::USER_AGENT
|
36
|
+
}.merge(options)
|
37
|
+
resolved_url = options[:resolve] ? resolve_url(url) : url
|
30
38
|
uri = URI.parse(resolved_url)
|
31
39
|
http = Net::HTTP.new(uri.host, uri.port)
|
32
|
-
http.use_ssl = true if resolved_url.
|
40
|
+
http.use_ssl = true if resolved_url.start_with?('https://')
|
33
41
|
|
34
42
|
request = Net::HTTP::Get.new(uri.request_uri)
|
35
|
-
request['User-Agent'] =
|
43
|
+
request['User-Agent'] = options[:user_agent]
|
36
44
|
http.request(request)
|
37
45
|
end
|
38
46
|
|
39
47
|
# Get response body, rescues with nil if an exception is raised.
|
40
|
-
# @see Request#
|
41
|
-
def
|
42
|
-
|
48
|
+
# @see Request#response
|
49
|
+
def response_body(*args)
|
50
|
+
response(*args).body
|
43
51
|
end
|
44
52
|
|
45
53
|
# Resolves a URL string and follows redirects.
|
46
54
|
# if the URL can't be resolved the original URL is returned.
|
47
|
-
# @param [String] url
|
48
|
-
# @param [Hash] options hash, with_query key (optional and true by default)
|
55
|
+
# @param [String] url to resolve
|
49
56
|
# @return [String] a URL string that potentially is a redirected URL
|
50
57
|
# @example Resolve google.com
|
51
58
|
# resolve_url('google.com')
|
52
59
|
# # => 'https://www.google.com'
|
53
|
-
def resolve_url(url
|
54
|
-
options = { with_query: true }.merge(options)
|
60
|
+
def resolve_url(url)
|
55
61
|
resolved = UrlResolver.resolve(url)
|
56
|
-
resolved =
|
62
|
+
resolved = resolved.prepend('http://') unless has_protocol?(resolved)
|
57
63
|
resolved
|
58
64
|
end
|
59
65
|
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
# remove_query('example.com/path?q=keyword')
|
65
|
-
# # => 'example.com/path'
|
66
|
-
def remove_query(url)
|
67
|
-
index = url.index('?')
|
68
|
-
index.nil? ? url : url[0...index]
|
66
|
+
private
|
67
|
+
|
68
|
+
def has_protocol?(url)
|
69
|
+
url.start_with?('https://') || url.start_with?('http://')
|
69
70
|
end
|
70
71
|
end
|
71
72
|
end
|
data/lib/site_mapper/robots.rb
CHANGED
@@ -6,6 +6,7 @@ module SiteMapper
|
|
6
6
|
class Robots
|
7
7
|
# Parses robots.txt
|
8
8
|
class ParsedRobots
|
9
|
+
# Initializes ParsedRobots
|
9
10
|
def initialize(body, user_agent)
|
10
11
|
@other = {}
|
11
12
|
@disallows = {}
|
@@ -61,7 +62,7 @@ module SiteMapper
|
|
61
62
|
path = uri.request_uri
|
62
63
|
|
63
64
|
user_agent.downcase!
|
64
|
-
|
65
|
+
|
65
66
|
@disallows.each do |key, value|
|
66
67
|
if user_agent =~ key
|
67
68
|
value.each do |rule|
|
@@ -71,9 +72,9 @@ module SiteMapper
|
|
71
72
|
end
|
72
73
|
end
|
73
74
|
end
|
74
|
-
|
75
|
+
|
75
76
|
@allows.each do |key, value|
|
76
|
-
unless allowed
|
77
|
+
unless allowed
|
77
78
|
if user_agent =~ key
|
78
79
|
value.each do |rule|
|
79
80
|
if path =~ rule
|
@@ -93,7 +94,8 @@ module SiteMapper
|
|
93
94
|
agent = to_regex(agent.downcase) if user_agent.is_a?(String)
|
94
95
|
@delays[agent]
|
95
96
|
end
|
96
|
-
|
97
|
+
|
98
|
+
# Return key/value pairs with unknown meaning.
|
97
99
|
# @return [Hash] key/value pairs from robots.txt
|
98
100
|
def other_values
|
99
101
|
@other
|
@@ -103,9 +105,11 @@ module SiteMapper
|
|
103
105
|
def sitemaps
|
104
106
|
@sitemaps
|
105
107
|
end
|
106
|
-
|
108
|
+
|
107
109
|
protected
|
108
|
-
|
110
|
+
|
111
|
+
# @return [Regex] regex from pattern
|
112
|
+
# @param [String] pattern to compile to Regex
|
109
113
|
def to_regex(pattern)
|
110
114
|
return /should-not-match-anything-123456789/ if pattern.strip.empty?
|
111
115
|
pattern = Regexp.escape(pattern)
|
@@ -123,7 +127,7 @@ module SiteMapper
|
|
123
127
|
@user_agent = user_agent
|
124
128
|
@parsed = {}
|
125
129
|
end
|
126
|
-
|
130
|
+
|
127
131
|
# @param [String, URI] uri String or URI to check
|
128
132
|
# @return [Boolean] true if uri is allowed to be crawled
|
129
133
|
# @example Check if http://www.google.com/googlesites is allowed to be crawled
|
@@ -134,8 +138,6 @@ module SiteMapper
|
|
134
138
|
host = uri.host
|
135
139
|
@parsed[host] ||= ParsedRobots.new(@robots_txt, @user_agent)
|
136
140
|
@parsed[host].allowed?(uri, @user_agent)
|
137
|
-
rescue
|
138
|
-
true
|
139
141
|
end
|
140
142
|
|
141
143
|
# @return [Array] array of sitemaps defined in robots.txt
|
@@ -146,10 +148,8 @@ module SiteMapper
|
|
146
148
|
host = @hostname
|
147
149
|
@parsed[host] ||= ParsedRobots.new(@robots_txt, @user_agent)
|
148
150
|
@parsed[host].sitemaps
|
149
|
-
rescue
|
150
|
-
[]
|
151
151
|
end
|
152
|
-
|
152
|
+
|
153
153
|
# @param [String, URI] uri String or URI get other_values from
|
154
154
|
# @return [Hash] key/value pairs from robots.txt
|
155
155
|
# @example Get other values for google.com
|
@@ -159,12 +159,10 @@ module SiteMapper
|
|
159
159
|
host = @hostname
|
160
160
|
@parsed[host] ||= ParsedRobots.new(@robots_txt, @user_agent)
|
161
161
|
@parsed[host].other_values
|
162
|
-
rescue
|
163
|
-
{}
|
164
162
|
end
|
165
163
|
|
166
|
-
private
|
167
|
-
|
164
|
+
private
|
165
|
+
|
168
166
|
def to_uri(uri)
|
169
167
|
uri = URI.parse(uri.to_s) unless uri.is_a?(URI)
|
170
168
|
uri
|
data/lib/site_mapper/version.rb
CHANGED
metadata
CHANGED
@@ -1,125 +1,125 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: site_mapper
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.12
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jacob Burenstam
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2015-04-04 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|
15
15
|
requirement: !ruby/object:Gem::Requirement
|
16
16
|
requirements:
|
17
|
-
- - ~>
|
17
|
+
- - "~>"
|
18
18
|
- !ruby/object:Gem::Version
|
19
19
|
version: '1.6'
|
20
20
|
type: :runtime
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
|
-
- - ~>
|
24
|
+
- - "~>"
|
25
25
|
- !ruby/object:Gem::Version
|
26
26
|
version: '1.6'
|
27
27
|
- !ruby/object:Gem::Dependency
|
28
28
|
name: url_resolver
|
29
29
|
requirement: !ruby/object:Gem::Requirement
|
30
30
|
requirements:
|
31
|
-
- - ~>
|
31
|
+
- - "~>"
|
32
32
|
- !ruby/object:Gem::Version
|
33
33
|
version: '0.1'
|
34
34
|
type: :runtime
|
35
35
|
prerelease: false
|
36
36
|
version_requirements: !ruby/object:Gem::Requirement
|
37
37
|
requirements:
|
38
|
-
- - ~>
|
38
|
+
- - "~>"
|
39
39
|
- !ruby/object:Gem::Version
|
40
40
|
version: '0.1'
|
41
41
|
- !ruby/object:Gem::Dependency
|
42
42
|
name: bundler
|
43
43
|
requirement: !ruby/object:Gem::Requirement
|
44
44
|
requirements:
|
45
|
-
- - ~>
|
45
|
+
- - "~>"
|
46
46
|
- !ruby/object:Gem::Version
|
47
47
|
version: '1.3'
|
48
48
|
type: :development
|
49
49
|
prerelease: false
|
50
50
|
version_requirements: !ruby/object:Gem::Requirement
|
51
51
|
requirements:
|
52
|
-
- - ~>
|
52
|
+
- - "~>"
|
53
53
|
- !ruby/object:Gem::Version
|
54
54
|
version: '1.3'
|
55
55
|
- !ruby/object:Gem::Dependency
|
56
56
|
name: rake
|
57
57
|
requirement: !ruby/object:Gem::Requirement
|
58
58
|
requirements:
|
59
|
-
- - ~>
|
59
|
+
- - "~>"
|
60
60
|
- !ruby/object:Gem::Version
|
61
61
|
version: '10.3'
|
62
62
|
type: :development
|
63
63
|
prerelease: false
|
64
64
|
version_requirements: !ruby/object:Gem::Requirement
|
65
65
|
requirements:
|
66
|
-
- - ~>
|
66
|
+
- - "~>"
|
67
67
|
- !ruby/object:Gem::Version
|
68
68
|
version: '10.3'
|
69
69
|
- !ruby/object:Gem::Dependency
|
70
70
|
name: rspec
|
71
71
|
requirement: !ruby/object:Gem::Requirement
|
72
72
|
requirements:
|
73
|
-
- - ~>
|
73
|
+
- - "~>"
|
74
74
|
- !ruby/object:Gem::Version
|
75
75
|
version: '3.1'
|
76
76
|
type: :development
|
77
77
|
prerelease: false
|
78
78
|
version_requirements: !ruby/object:Gem::Requirement
|
79
79
|
requirements:
|
80
|
-
- - ~>
|
80
|
+
- - "~>"
|
81
81
|
- !ruby/object:Gem::Version
|
82
82
|
version: '3.1'
|
83
83
|
- !ruby/object:Gem::Dependency
|
84
84
|
name: yard
|
85
85
|
requirement: !ruby/object:Gem::Requirement
|
86
86
|
requirements:
|
87
|
-
- - ~>
|
87
|
+
- - "~>"
|
88
88
|
- !ruby/object:Gem::Version
|
89
89
|
version: '0.8'
|
90
90
|
type: :development
|
91
91
|
prerelease: false
|
92
92
|
version_requirements: !ruby/object:Gem::Requirement
|
93
93
|
requirements:
|
94
|
-
- - ~>
|
94
|
+
- - "~>"
|
95
95
|
- !ruby/object:Gem::Version
|
96
96
|
version: '0.8'
|
97
97
|
- !ruby/object:Gem::Dependency
|
98
98
|
name: redcarpet
|
99
99
|
requirement: !ruby/object:Gem::Requirement
|
100
100
|
requirements:
|
101
|
-
- - ~>
|
101
|
+
- - "~>"
|
102
102
|
- !ruby/object:Gem::Version
|
103
103
|
version: '3.2'
|
104
104
|
type: :development
|
105
105
|
prerelease: false
|
106
106
|
version_requirements: !ruby/object:Gem::Requirement
|
107
107
|
requirements:
|
108
|
-
- - ~>
|
108
|
+
- - "~>"
|
109
109
|
- !ruby/object:Gem::Version
|
110
110
|
version: '3.2'
|
111
111
|
- !ruby/object:Gem::Dependency
|
112
112
|
name: coveralls
|
113
113
|
requirement: !ruby/object:Gem::Requirement
|
114
114
|
requirements:
|
115
|
-
- - ~>
|
115
|
+
- - "~>"
|
116
116
|
- !ruby/object:Gem::Version
|
117
117
|
version: '0.7'
|
118
118
|
type: :development
|
119
119
|
prerelease: false
|
120
120
|
version_requirements: !ruby/object:Gem::Requirement
|
121
121
|
requirements:
|
122
|
-
- - ~>
|
122
|
+
- - "~>"
|
123
123
|
- !ruby/object:Gem::Version
|
124
124
|
version: '0.7'
|
125
125
|
description: Map all links on a given site.
|
@@ -131,13 +131,13 @@ extensions: []
|
|
131
131
|
extra_rdoc_files: []
|
132
132
|
files:
|
133
133
|
- bin/site_mapper
|
134
|
+
- lib/site_mapper.rb
|
134
135
|
- lib/site_mapper/crawl_url.rb
|
135
136
|
- lib/site_mapper/crawler.rb
|
136
137
|
- lib/site_mapper/logger.rb
|
137
138
|
- lib/site_mapper/request.rb
|
138
139
|
- lib/site_mapper/robots.rb
|
139
140
|
- lib/site_mapper/version.rb
|
140
|
-
- lib/site_mapper.rb
|
141
141
|
homepage: https://github.com/buren/site_mapper
|
142
142
|
licenses:
|
143
143
|
- MIT
|
@@ -148,17 +148,17 @@ require_paths:
|
|
148
148
|
- lib
|
149
149
|
required_ruby_version: !ruby/object:Gem::Requirement
|
150
150
|
requirements:
|
151
|
-
- -
|
151
|
+
- - ">="
|
152
152
|
- !ruby/object:Gem::Version
|
153
153
|
version: 1.9.3
|
154
154
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
155
155
|
requirements:
|
156
|
-
- -
|
156
|
+
- - ">="
|
157
157
|
- !ruby/object:Gem::Version
|
158
158
|
version: '0'
|
159
159
|
requirements: []
|
160
160
|
rubyforge_project:
|
161
|
-
rubygems_version: 2.
|
161
|
+
rubygems_version: 2.2.2
|
162
162
|
signing_key:
|
163
163
|
specification_version: 4
|
164
164
|
summary: Map all links on a given site.
|