site_mapper 0.0.10 → 0.0.12
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/site_mapper.rb +12 -3
- data/lib/site_mapper/crawl_url.rb +18 -53
- data/lib/site_mapper/crawler.rb +42 -15
- data/lib/site_mapper/logger.rb +11 -0
- data/lib/site_mapper/request.rb +28 -27
- data/lib/site_mapper/robots.rb +14 -16
- data/lib/site_mapper/version.rb +1 -1
- metadata +22 -22
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA1:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: dc4b21c14dc15f1cc6df4b6406d12acf5cb821d9
|
|
4
|
+
data.tar.gz: 34ef0ab2fcd0a74bbcdd53d9e47681d6440f951d
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 1d4da1f2753dfb5f06ea577c02183efbf4cb919b783ce128f07b46f29b6af7a330cc01d839895ea4d4fb53cf68db2a6b5adccba47530a0278aaca9bfe1fa4c02
|
|
7
|
+
data.tar.gz: c32dde9478240d63b63d6e521e04f3c914322b544c79eee7ee25e1b2ea46b5ad2529ddc7d778d30fe2c10cde074f88a72c53458a814fc5d2ab74bc87288e63ed
|
data/lib/site_mapper.rb
CHANGED
|
@@ -20,10 +20,19 @@ module SiteMapper
|
|
|
20
20
|
# @param [String] link to domain
|
|
21
21
|
# @param [Hash] options hash
|
|
22
22
|
# @example Collect all URLs from example.com
|
|
23
|
-
#
|
|
23
|
+
# SiteMapper.map('example.com')
|
|
24
|
+
# @example Collect all URLs from example.com with custom User-agent
|
|
25
|
+
# SiteMapper.map('example.com', user_agent: 'MyUserAgent')
|
|
26
|
+
# @example Collect all URLs from example.com with custom logger class
|
|
27
|
+
# class MyLogger
|
|
28
|
+
# def self.log(msg); puts msg;end
|
|
29
|
+
# def self.err_log(msg); puts msg;end
|
|
30
|
+
# end
|
|
31
|
+
# SiteMapper.map('example.com', logger: MyLogger)
|
|
24
32
|
def self.map(link, options = {})
|
|
25
|
-
set_logger(options
|
|
26
|
-
|
|
33
|
+
set_logger(options.delete(:logger))
|
|
34
|
+
options = { user_agent: USER_AGENT }.merge(options)
|
|
35
|
+
Crawler.collect_urls(link, options) { |url| yield(url) if block_given? }
|
|
27
36
|
end
|
|
28
37
|
|
|
29
38
|
# Set logger.
|
|
@@ -1,85 +1,50 @@
|
|
|
1
1
|
module SiteMapper
|
|
2
2
|
# Crawl URL formatter.
|
|
3
3
|
class CrawlUrl
|
|
4
|
-
attr_reader :resolved_base_url
|
|
4
|
+
attr_reader :resolved_base_url
|
|
5
5
|
|
|
6
|
+
# Too many request error message
|
|
7
|
+
TOO_MANY_REQUEST_MSG = "You're being challenged with a 'too many requests' captcha"
|
|
8
|
+
|
|
9
|
+
# Initialize CrawlUrl
|
|
6
10
|
# @param [String] base_url
|
|
11
|
+
# @example Initialize CrawlUrl with example.com as base_url
|
|
12
|
+
# CrawlUrl.new('example.com')
|
|
7
13
|
def initialize(base_url)
|
|
8
|
-
@resolved_base_url = Request.resolve_url(base_url
|
|
14
|
+
@resolved_base_url = Request.resolve_url(base_url) # "#{protocol}#{host}"
|
|
9
15
|
@base_hostname = URI.parse(@resolved_base_url).hostname
|
|
10
|
-
@resolved_base_url.prepend('http://') unless @resolved_base_url.start_with?('http')
|
|
11
16
|
end
|
|
12
17
|
|
|
13
18
|
# Given a link it constructs the absolute path,
|
|
14
19
|
# if valid URL & URL has same domain as @resolved_base_url.
|
|
15
|
-
# @param [String]
|
|
16
|
-
# @param [String]
|
|
20
|
+
# @param [String] page_url url found on page
|
|
21
|
+
# @param [String] current_url current page url
|
|
17
22
|
# @return [String] with absolute path to resource
|
|
18
23
|
# @example Construct absolute URL for '/path', example.com
|
|
19
24
|
# cu = CrawlUrl.new('example.com')
|
|
20
25
|
# cu.absolute_url_from('/path', 'example.com/some/path')
|
|
21
26
|
# # => http://example.com/some/path
|
|
22
|
-
def absolute_url_from(
|
|
23
|
-
return
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
elsif same_domain?(raw_url, @resolved_base_url)
|
|
28
|
-
raw_url
|
|
29
|
-
else
|
|
30
|
-
nil
|
|
31
|
-
end
|
|
27
|
+
def absolute_url_from(page_url, current_url)
|
|
28
|
+
return unless eligible_url?(page_url)
|
|
29
|
+
parsed_uri = URI.join(current_url, page_url) rescue return
|
|
30
|
+
return unless parsed_uri.hostname == @base_hostname
|
|
31
|
+
parsed_uri.to_s
|
|
32
32
|
end
|
|
33
33
|
|
|
34
34
|
private
|
|
35
35
|
|
|
36
|
-
def url_from_relative(url, current_page_url)
|
|
37
|
-
if url.start_with?('/')
|
|
38
|
-
"#{without_path_suffix(resolved_base_url)}#{url}"
|
|
39
|
-
elsif url.start_with?('../')
|
|
40
|
-
"#{url_from_dotted_url(url, current_page_url)}"
|
|
41
|
-
else
|
|
42
|
-
"#{with_path_suffix(resolved_base_url)}#{url}"
|
|
43
|
-
end
|
|
44
|
-
end
|
|
45
|
-
|
|
46
|
-
def url_from_dotted_url(url, current_page_url)
|
|
47
|
-
absolute_url = with_path_suffix(current_page_url.dup)
|
|
48
|
-
found_dots = without_path_suffix(url).scan('../').length
|
|
49
|
-
removed_dots = 0
|
|
50
|
-
max_levels = 4
|
|
51
|
-
while found_dots >= removed_dots && max_levels > removed_dots
|
|
52
|
-
index = absolute_url.rindex('/') or break
|
|
53
|
-
absolute_url = absolute_url[0..(index - 1)]
|
|
54
|
-
removed_dots += 1
|
|
55
|
-
end
|
|
56
|
-
"#{with_path_suffix(absolute_url)}#{url.gsub('../', '')}"
|
|
57
|
-
end
|
|
58
|
-
|
|
59
|
-
def with_path_suffix(passed_url)
|
|
60
|
-
url = passed_url.dup
|
|
61
|
-
url.end_with?('/') ? url : url << '/'
|
|
62
|
-
end
|
|
63
|
-
|
|
64
|
-
def without_path_suffix(passed_url)
|
|
65
|
-
url = passed_url.dup
|
|
66
|
-
url.end_with?('/') ? url[0...(url.length - 1)] : url
|
|
67
|
-
end
|
|
68
|
-
|
|
69
36
|
def eligible_url?(href)
|
|
70
37
|
return false if href.nil? || href.empty?
|
|
71
38
|
dont_start = %w(javascript: callto: mailto: tel: skype: facetime: wtai: #)
|
|
72
39
|
dont_include = %w(/email-protection#)
|
|
73
|
-
|
|
40
|
+
err_include = %w(/sorry/IndexRedirect?)
|
|
41
|
+
dont_end = %w(.zip .rar .json .pdf .exe .dmg .pkg .dpkg .bat)
|
|
74
42
|
|
|
43
|
+
err_include.each { |pattern| fail TOO_MANY_REQUEST_MSG if href.include?(pattern) }
|
|
75
44
|
dont_start.each { |pattern| return false if href.start_with?(pattern) }
|
|
76
45
|
dont_include.each { |pattern| return false if href.include?(pattern) }
|
|
77
46
|
dont_end.each { |pattern| return false if href.end_with?(pattern) }
|
|
78
47
|
true
|
|
79
48
|
end
|
|
80
|
-
|
|
81
|
-
def same_domain?(first, second)
|
|
82
|
-
first.include?(second)
|
|
83
|
-
end
|
|
84
49
|
end
|
|
85
50
|
end
|
data/lib/site_mapper/crawler.rb
CHANGED
|
@@ -4,20 +4,39 @@ require 'nokogiri'
|
|
|
4
4
|
module SiteMapper
|
|
5
5
|
# Crawls a given site.
|
|
6
6
|
class Crawler
|
|
7
|
+
# Default options
|
|
8
|
+
OPTIONS = {
|
|
9
|
+
resolve: false,
|
|
10
|
+
sleep_length: 0.5,
|
|
11
|
+
max_requests: Float::INFINITY
|
|
12
|
+
}
|
|
13
|
+
|
|
7
14
|
# @param [String] url base url for crawler
|
|
8
15
|
# @param [Hash] options hash, resolve key (optional false by default)
|
|
16
|
+
# add user_agent key to specify custom User-agent
|
|
17
|
+
# @example Create crawler with custom User-agent
|
|
18
|
+
# Crawler.new('example.com', user_agent: 'MyUserAgent')
|
|
19
|
+
# @example Create crawler and resolve all urls
|
|
20
|
+
# Crawler.new('example.com', resolve: true)
|
|
21
|
+
# @example Create crawler and sleep 1 second between each request
|
|
22
|
+
# Crawler.new('example.com', sleep_length: 1)
|
|
23
|
+
# @example Create crawler and perform max 3 requests
|
|
24
|
+
# Crawler.new('example.com', max_requests: 3)
|
|
9
25
|
def initialize(url, options = {})
|
|
10
26
|
@base_url = Request.resolve_url(url)
|
|
11
|
-
@options =
|
|
27
|
+
@options = OPTIONS.dup.merge(options)
|
|
28
|
+
@user_agent = @options.fetch(:user_agent)
|
|
12
29
|
@crawl_url = CrawlUrl.new(@base_url)
|
|
13
30
|
@fetch_queue = CrawlQueue.new
|
|
14
31
|
@processed = Set.new
|
|
15
32
|
@robots = nil
|
|
16
33
|
end
|
|
17
34
|
|
|
35
|
+
# See documentation for the instance variant of this method.
|
|
36
|
+
# @return [Array] with links.
|
|
18
37
|
# @see #collect_urls
|
|
19
|
-
def self.collect_urls(
|
|
20
|
-
new(
|
|
38
|
+
def self.collect_urls(*args)
|
|
39
|
+
new(*args).collect_urls { |url| yield(url) }
|
|
21
40
|
end
|
|
22
41
|
|
|
23
42
|
# Collects all links on domain for domain.
|
|
@@ -32,13 +51,16 @@ module SiteMapper
|
|
|
32
51
|
# end
|
|
33
52
|
def collect_urls
|
|
34
53
|
@fetch_queue << @crawl_url.resolved_base_url
|
|
35
|
-
until @fetch_queue.empty?
|
|
54
|
+
until @fetch_queue.empty? || @processed.length >= @options[:max_requests]
|
|
36
55
|
url = @fetch_queue.pop
|
|
37
56
|
yield(url)
|
|
38
|
-
|
|
57
|
+
page_urls_for(url)
|
|
39
58
|
end
|
|
40
|
-
|
|
41
|
-
|
|
59
|
+
result = @processed + @fetch_queue
|
|
60
|
+
Logger.log "Crawling finished:"
|
|
61
|
+
Logger.log "Processed links: #{@processed.length}"
|
|
62
|
+
Logger.log "Found links: #{result.length}"
|
|
63
|
+
result.to_a
|
|
42
64
|
rescue Interrupt, IRB::Abort
|
|
43
65
|
Logger.err_log 'Crawl interrupted.'
|
|
44
66
|
@fetch_queue.to_a
|
|
@@ -46,12 +68,13 @@ module SiteMapper
|
|
|
46
68
|
|
|
47
69
|
private
|
|
48
70
|
|
|
49
|
-
def
|
|
50
|
-
Logger.log "Queue length: #{@fetch_queue.length}, Parsing: #{
|
|
51
|
-
link_elements = Request.
|
|
52
|
-
|
|
71
|
+
def page_urls_for(current_url)
|
|
72
|
+
Logger.log "Queue length: #{@fetch_queue.length}, Parsing: #{current_url}"
|
|
73
|
+
link_elements = Request.document(current_url, user_agent: @options[:user_agent]).css('a')
|
|
74
|
+
wait
|
|
75
|
+
@processed << current_url
|
|
53
76
|
link_elements.each do |page_link|
|
|
54
|
-
url = @crawl_url.absolute_url_from(page_link.attr('href'),
|
|
77
|
+
url = @crawl_url.absolute_url_from(page_link.attr('href'), current_url)
|
|
55
78
|
@fetch_queue << url if url && eligible_for_queue?(resolve(url))
|
|
56
79
|
end
|
|
57
80
|
end
|
|
@@ -62,7 +85,7 @@ module SiteMapper
|
|
|
62
85
|
|
|
63
86
|
def robots
|
|
64
87
|
return @robots unless @robots.nil?
|
|
65
|
-
robots_body = Request.
|
|
88
|
+
robots_body = Request.response_body("#{@base_url}/robots.txt", user_agent: @options[:user_agent])
|
|
66
89
|
@robots = Robots.new(robots_body, URI.parse(@base_url).host, SiteMapper::USER_AGENT)
|
|
67
90
|
@robots
|
|
68
91
|
end
|
|
@@ -71,13 +94,17 @@ module SiteMapper
|
|
|
71
94
|
@options[:resolve] ? Request.resolve_url(url) : url
|
|
72
95
|
end
|
|
73
96
|
|
|
97
|
+
def wait
|
|
98
|
+
sleep @options[:sleep_length]
|
|
99
|
+
end
|
|
100
|
+
|
|
74
101
|
# Queue of urls to be crawled.
|
|
75
102
|
class CrawlQueue
|
|
76
103
|
# @return [Set] that extends EnumerablePop module
|
|
77
104
|
def self.new
|
|
78
105
|
Set.new.extend(EnumerablePop)
|
|
79
106
|
end
|
|
80
|
-
|
|
107
|
+
|
|
81
108
|
# Add pop method when added to class.
|
|
82
109
|
# The class that extends this module need to implement #first and #delete.
|
|
83
110
|
module EnumerablePop
|
|
@@ -90,5 +117,5 @@ module SiteMapper
|
|
|
90
117
|
end
|
|
91
118
|
end
|
|
92
119
|
end
|
|
93
|
-
end
|
|
120
|
+
end
|
|
94
121
|
end
|
data/lib/site_mapper/logger.rb
CHANGED
|
@@ -1,6 +1,9 @@
|
|
|
1
1
|
module SiteMapper
|
|
2
2
|
# Handles logging
|
|
3
3
|
class Logger
|
|
4
|
+
|
|
5
|
+
# Choose what logger to use by type.
|
|
6
|
+
# @return [Object] returns the appropriate logger.
|
|
4
7
|
# @param [Symbol] type of logger class to be used
|
|
5
8
|
def self.use_logger_type(type)
|
|
6
9
|
fail 'Logger already set' if defined?(@@log)
|
|
@@ -15,18 +18,22 @@ module SiteMapper
|
|
|
15
18
|
@@log
|
|
16
19
|
end
|
|
17
20
|
|
|
21
|
+
# Choose what logger to use.
|
|
22
|
+
# @return [Object] returns logger.
|
|
18
23
|
# @param [Class, #log, #err_log] logger a logger class
|
|
19
24
|
def self.use_logger(logger)
|
|
20
25
|
fail 'Logger already set' if defined?(@@log)
|
|
21
26
|
@@log = logger
|
|
22
27
|
end
|
|
23
28
|
|
|
29
|
+
# Send a message to the logger
|
|
24
30
|
# @param [String] msg to be logged
|
|
25
31
|
def self.log(msg)
|
|
26
32
|
@@log ||= use_logger_type(:default)
|
|
27
33
|
@@log.log(msg)
|
|
28
34
|
end
|
|
29
35
|
|
|
36
|
+
# Send an error message to the logger
|
|
30
37
|
# @param [String] err_msg to be logged
|
|
31
38
|
def self.err_log(err_msg)
|
|
32
39
|
@@log ||= use_logger_type(:default)
|
|
@@ -35,11 +42,13 @@ module SiteMapper
|
|
|
35
42
|
|
|
36
43
|
# Log to terminal.
|
|
37
44
|
module SystemOutLogger
|
|
45
|
+
# Log to STDOUT
|
|
38
46
|
# @param [String] msg to be logged to STDOUT
|
|
39
47
|
def self.log(msg)
|
|
40
48
|
STDOUT.puts(msg)
|
|
41
49
|
end
|
|
42
50
|
|
|
51
|
+
# Log to STDERR
|
|
43
52
|
# @param [String] msg to be logged to STDERR
|
|
44
53
|
def self.err_log(msg)
|
|
45
54
|
STDERR.puts("[ERROR] #{msg}")
|
|
@@ -48,8 +57,10 @@ module SiteMapper
|
|
|
48
57
|
|
|
49
58
|
# Don't log
|
|
50
59
|
module NilLogger
|
|
60
|
+
# Don't log
|
|
51
61
|
# @param [String] msg to be ignored
|
|
52
62
|
def self.log(msg);end
|
|
63
|
+
# Don't error log
|
|
53
64
|
# @param [String] msg to be ignored
|
|
54
65
|
def self.err_log(msg);end
|
|
55
66
|
end
|
data/lib/site_mapper/request.rb
CHANGED
|
@@ -11,61 +11,62 @@ module SiteMapper
|
|
|
11
11
|
class << self
|
|
12
12
|
# Given an URL get it then parse it with Nokogiri::HTML.
|
|
13
13
|
# @param [String] url
|
|
14
|
+
# @param [Hash] options
|
|
14
15
|
# @return [Nokogiri::HTML] a nokogiri HTML object
|
|
15
|
-
def
|
|
16
|
-
Nokogiri::HTML(Request.
|
|
16
|
+
def document(url, options = {})
|
|
17
|
+
Nokogiri::HTML(Request.response_body(url, options))
|
|
17
18
|
end
|
|
18
19
|
|
|
19
20
|
# Given an URL get the response.
|
|
20
21
|
# @param [String] url
|
|
21
|
-
# @param [
|
|
22
|
+
# @param [Hash] options
|
|
22
23
|
# @return [Net::HTTPOK] if response is successful, raises error otherwise
|
|
23
24
|
# @example get example.com and resolve the URL
|
|
24
|
-
# Request.
|
|
25
|
+
# Request.response('example.com', resolve: true)
|
|
25
26
|
# @example get example.com and do *not* resolve the URL
|
|
26
|
-
# Request.
|
|
27
|
-
#
|
|
28
|
-
|
|
29
|
-
|
|
27
|
+
# Request.response('http://example.com')
|
|
28
|
+
# @example get example.com and resolve the URL
|
|
29
|
+
# Request.response('http://example.com', resolve: true)
|
|
30
|
+
# @example get example.com and resolve the URL and use a custom User-Agent
|
|
31
|
+
# Request.response('http://example.com', resolve: true, user_agent: 'MyUserAgent')
|
|
32
|
+
def response(url, options = {})
|
|
33
|
+
options = {
|
|
34
|
+
resolve: false,
|
|
35
|
+
user_agent: SiteMapper::USER_AGENT
|
|
36
|
+
}.merge(options)
|
|
37
|
+
resolved_url = options[:resolve] ? resolve_url(url) : url
|
|
30
38
|
uri = URI.parse(resolved_url)
|
|
31
39
|
http = Net::HTTP.new(uri.host, uri.port)
|
|
32
|
-
http.use_ssl = true if resolved_url.
|
|
40
|
+
http.use_ssl = true if resolved_url.start_with?('https://')
|
|
33
41
|
|
|
34
42
|
request = Net::HTTP::Get.new(uri.request_uri)
|
|
35
|
-
request['User-Agent'] =
|
|
43
|
+
request['User-Agent'] = options[:user_agent]
|
|
36
44
|
http.request(request)
|
|
37
45
|
end
|
|
38
46
|
|
|
39
47
|
# Get response body, rescues with nil if an exception is raised.
|
|
40
|
-
# @see Request#
|
|
41
|
-
def
|
|
42
|
-
|
|
48
|
+
# @see Request#response
|
|
49
|
+
def response_body(*args)
|
|
50
|
+
response(*args).body
|
|
43
51
|
end
|
|
44
52
|
|
|
45
53
|
# Resolve an URL string and follows redirects.
|
|
46
54
|
# if the URL can't be resolved the original URL is returned.
|
|
47
|
-
# @param [String] url
|
|
48
|
-
# @param [Hash] options hash, with_query key (optional and true by default)
|
|
55
|
+
# @param [String] url to resolve
|
|
49
56
|
# @return [String] a URL string that potentially is a redirected URL
|
|
50
57
|
# @example Resolve google.com
|
|
51
58
|
# resolve_url('google.com')
|
|
52
59
|
# # => 'https://www.google.com'
|
|
53
|
-
def resolve_url(url
|
|
54
|
-
options = { with_query: true }.merge(options)
|
|
60
|
+
def resolve_url(url)
|
|
55
61
|
resolved = UrlResolver.resolve(url)
|
|
56
|
-
resolved =
|
|
62
|
+
resolved = resolved.prepend('http://') unless has_protocol?(resolved)
|
|
57
63
|
resolved
|
|
58
64
|
end
|
|
59
65
|
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
# remove_query('example.com/path?q=keyword')
|
|
65
|
-
# # => 'example.com/path'
|
|
66
|
-
def remove_query(url)
|
|
67
|
-
index = url.index('?')
|
|
68
|
-
index.nil? ? url : url[0...index]
|
|
66
|
+
private
|
|
67
|
+
|
|
68
|
+
def has_protocol?(url)
|
|
69
|
+
url.start_with?('https://') || url.start_with?('http://')
|
|
69
70
|
end
|
|
70
71
|
end
|
|
71
72
|
end
|
data/lib/site_mapper/robots.rb
CHANGED
|
@@ -6,6 +6,7 @@ module SiteMapper
|
|
|
6
6
|
class Robots
|
|
7
7
|
# Parses robots.txt
|
|
8
8
|
class ParsedRobots
|
|
9
|
+
# Initializes ParsedRobots
|
|
9
10
|
def initialize(body, user_agent)
|
|
10
11
|
@other = {}
|
|
11
12
|
@disallows = {}
|
|
@@ -61,7 +62,7 @@ module SiteMapper
|
|
|
61
62
|
path = uri.request_uri
|
|
62
63
|
|
|
63
64
|
user_agent.downcase!
|
|
64
|
-
|
|
65
|
+
|
|
65
66
|
@disallows.each do |key, value|
|
|
66
67
|
if user_agent =~ key
|
|
67
68
|
value.each do |rule|
|
|
@@ -71,9 +72,9 @@ module SiteMapper
|
|
|
71
72
|
end
|
|
72
73
|
end
|
|
73
74
|
end
|
|
74
|
-
|
|
75
|
+
|
|
75
76
|
@allows.each do |key, value|
|
|
76
|
-
unless allowed
|
|
77
|
+
unless allowed
|
|
77
78
|
if user_agent =~ key
|
|
78
79
|
value.each do |rule|
|
|
79
80
|
if path =~ rule
|
|
@@ -93,7 +94,8 @@ module SiteMapper
|
|
|
93
94
|
agent = to_regex(agent.downcase) if user_agent.is_a?(String)
|
|
94
95
|
@delays[agent]
|
|
95
96
|
end
|
|
96
|
-
|
|
97
|
+
|
|
98
|
+
# Return key/value pairs with unknown meaning.
|
|
97
99
|
# @return [Hash] key/value pairs from robots.txt
|
|
98
100
|
def other_values
|
|
99
101
|
@other
|
|
@@ -103,9 +105,11 @@ module SiteMapper
|
|
|
103
105
|
def sitemaps
|
|
104
106
|
@sitemaps
|
|
105
107
|
end
|
|
106
|
-
|
|
108
|
+
|
|
107
109
|
protected
|
|
108
|
-
|
|
110
|
+
|
|
111
|
+
# @return [Regex] regex from pattern
|
|
112
|
+
# @param [String] pattern to compile to Regex
|
|
109
113
|
def to_regex(pattern)
|
|
110
114
|
return /should-not-match-anything-123456789/ if pattern.strip.empty?
|
|
111
115
|
pattern = Regexp.escape(pattern)
|
|
@@ -123,7 +127,7 @@ module SiteMapper
|
|
|
123
127
|
@user_agent = user_agent
|
|
124
128
|
@parsed = {}
|
|
125
129
|
end
|
|
126
|
-
|
|
130
|
+
|
|
127
131
|
# @param [String, URI] uri String or URI to check
|
|
128
132
|
# @return [Boolean] true if uri is allowed to be crawled
|
|
129
133
|
# @example Check if http://www.google.com/googlesites is allowed to be crawled
|
|
@@ -134,8 +138,6 @@ module SiteMapper
|
|
|
134
138
|
host = uri.host
|
|
135
139
|
@parsed[host] ||= ParsedRobots.new(@robots_txt, @user_agent)
|
|
136
140
|
@parsed[host].allowed?(uri, @user_agent)
|
|
137
|
-
rescue
|
|
138
|
-
true
|
|
139
141
|
end
|
|
140
142
|
|
|
141
143
|
# @return [Array] array of sitemaps defined in robots.txt
|
|
@@ -146,10 +148,8 @@ module SiteMapper
|
|
|
146
148
|
host = @hostname
|
|
147
149
|
@parsed[host] ||= ParsedRobots.new(@robots_txt, @user_agent)
|
|
148
150
|
@parsed[host].sitemaps
|
|
149
|
-
rescue
|
|
150
|
-
[]
|
|
151
151
|
end
|
|
152
|
-
|
|
152
|
+
|
|
153
153
|
# @param [String, URI] uri String or URI get other_values from
|
|
154
154
|
# @return [Hash] key/value pairs from robots.txt
|
|
155
155
|
# @example Get other values for google.com
|
|
@@ -159,12 +159,10 @@ module SiteMapper
|
|
|
159
159
|
host = @hostname
|
|
160
160
|
@parsed[host] ||= ParsedRobots.new(@robots_txt, @user_agent)
|
|
161
161
|
@parsed[host].other_values
|
|
162
|
-
rescue
|
|
163
|
-
{}
|
|
164
162
|
end
|
|
165
163
|
|
|
166
|
-
private
|
|
167
|
-
|
|
164
|
+
private
|
|
165
|
+
|
|
168
166
|
def to_uri(uri)
|
|
169
167
|
uri = URI.parse(uri.to_s) unless uri.is_a?(URI)
|
|
170
168
|
uri
|
data/lib/site_mapper/version.rb
CHANGED
metadata
CHANGED
|
@@ -1,125 +1,125 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: site_mapper
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.0.
|
|
4
|
+
version: 0.0.12
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Jacob Burenstam
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: bin
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date:
|
|
11
|
+
date: 2015-04-04 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: nokogiri
|
|
15
15
|
requirement: !ruby/object:Gem::Requirement
|
|
16
16
|
requirements:
|
|
17
|
-
- - ~>
|
|
17
|
+
- - "~>"
|
|
18
18
|
- !ruby/object:Gem::Version
|
|
19
19
|
version: '1.6'
|
|
20
20
|
type: :runtime
|
|
21
21
|
prerelease: false
|
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
|
23
23
|
requirements:
|
|
24
|
-
- - ~>
|
|
24
|
+
- - "~>"
|
|
25
25
|
- !ruby/object:Gem::Version
|
|
26
26
|
version: '1.6'
|
|
27
27
|
- !ruby/object:Gem::Dependency
|
|
28
28
|
name: url_resolver
|
|
29
29
|
requirement: !ruby/object:Gem::Requirement
|
|
30
30
|
requirements:
|
|
31
|
-
- - ~>
|
|
31
|
+
- - "~>"
|
|
32
32
|
- !ruby/object:Gem::Version
|
|
33
33
|
version: '0.1'
|
|
34
34
|
type: :runtime
|
|
35
35
|
prerelease: false
|
|
36
36
|
version_requirements: !ruby/object:Gem::Requirement
|
|
37
37
|
requirements:
|
|
38
|
-
- - ~>
|
|
38
|
+
- - "~>"
|
|
39
39
|
- !ruby/object:Gem::Version
|
|
40
40
|
version: '0.1'
|
|
41
41
|
- !ruby/object:Gem::Dependency
|
|
42
42
|
name: bundler
|
|
43
43
|
requirement: !ruby/object:Gem::Requirement
|
|
44
44
|
requirements:
|
|
45
|
-
- - ~>
|
|
45
|
+
- - "~>"
|
|
46
46
|
- !ruby/object:Gem::Version
|
|
47
47
|
version: '1.3'
|
|
48
48
|
type: :development
|
|
49
49
|
prerelease: false
|
|
50
50
|
version_requirements: !ruby/object:Gem::Requirement
|
|
51
51
|
requirements:
|
|
52
|
-
- - ~>
|
|
52
|
+
- - "~>"
|
|
53
53
|
- !ruby/object:Gem::Version
|
|
54
54
|
version: '1.3'
|
|
55
55
|
- !ruby/object:Gem::Dependency
|
|
56
56
|
name: rake
|
|
57
57
|
requirement: !ruby/object:Gem::Requirement
|
|
58
58
|
requirements:
|
|
59
|
-
- - ~>
|
|
59
|
+
- - "~>"
|
|
60
60
|
- !ruby/object:Gem::Version
|
|
61
61
|
version: '10.3'
|
|
62
62
|
type: :development
|
|
63
63
|
prerelease: false
|
|
64
64
|
version_requirements: !ruby/object:Gem::Requirement
|
|
65
65
|
requirements:
|
|
66
|
-
- - ~>
|
|
66
|
+
- - "~>"
|
|
67
67
|
- !ruby/object:Gem::Version
|
|
68
68
|
version: '10.3'
|
|
69
69
|
- !ruby/object:Gem::Dependency
|
|
70
70
|
name: rspec
|
|
71
71
|
requirement: !ruby/object:Gem::Requirement
|
|
72
72
|
requirements:
|
|
73
|
-
- - ~>
|
|
73
|
+
- - "~>"
|
|
74
74
|
- !ruby/object:Gem::Version
|
|
75
75
|
version: '3.1'
|
|
76
76
|
type: :development
|
|
77
77
|
prerelease: false
|
|
78
78
|
version_requirements: !ruby/object:Gem::Requirement
|
|
79
79
|
requirements:
|
|
80
|
-
- - ~>
|
|
80
|
+
- - "~>"
|
|
81
81
|
- !ruby/object:Gem::Version
|
|
82
82
|
version: '3.1'
|
|
83
83
|
- !ruby/object:Gem::Dependency
|
|
84
84
|
name: yard
|
|
85
85
|
requirement: !ruby/object:Gem::Requirement
|
|
86
86
|
requirements:
|
|
87
|
-
- - ~>
|
|
87
|
+
- - "~>"
|
|
88
88
|
- !ruby/object:Gem::Version
|
|
89
89
|
version: '0.8'
|
|
90
90
|
type: :development
|
|
91
91
|
prerelease: false
|
|
92
92
|
version_requirements: !ruby/object:Gem::Requirement
|
|
93
93
|
requirements:
|
|
94
|
-
- - ~>
|
|
94
|
+
- - "~>"
|
|
95
95
|
- !ruby/object:Gem::Version
|
|
96
96
|
version: '0.8'
|
|
97
97
|
- !ruby/object:Gem::Dependency
|
|
98
98
|
name: redcarpet
|
|
99
99
|
requirement: !ruby/object:Gem::Requirement
|
|
100
100
|
requirements:
|
|
101
|
-
- - ~>
|
|
101
|
+
- - "~>"
|
|
102
102
|
- !ruby/object:Gem::Version
|
|
103
103
|
version: '3.2'
|
|
104
104
|
type: :development
|
|
105
105
|
prerelease: false
|
|
106
106
|
version_requirements: !ruby/object:Gem::Requirement
|
|
107
107
|
requirements:
|
|
108
|
-
- - ~>
|
|
108
|
+
- - "~>"
|
|
109
109
|
- !ruby/object:Gem::Version
|
|
110
110
|
version: '3.2'
|
|
111
111
|
- !ruby/object:Gem::Dependency
|
|
112
112
|
name: coveralls
|
|
113
113
|
requirement: !ruby/object:Gem::Requirement
|
|
114
114
|
requirements:
|
|
115
|
-
- - ~>
|
|
115
|
+
- - "~>"
|
|
116
116
|
- !ruby/object:Gem::Version
|
|
117
117
|
version: '0.7'
|
|
118
118
|
type: :development
|
|
119
119
|
prerelease: false
|
|
120
120
|
version_requirements: !ruby/object:Gem::Requirement
|
|
121
121
|
requirements:
|
|
122
|
-
- - ~>
|
|
122
|
+
- - "~>"
|
|
123
123
|
- !ruby/object:Gem::Version
|
|
124
124
|
version: '0.7'
|
|
125
125
|
description: Map all links on a given site.
|
|
@@ -131,13 +131,13 @@ extensions: []
|
|
|
131
131
|
extra_rdoc_files: []
|
|
132
132
|
files:
|
|
133
133
|
- bin/site_mapper
|
|
134
|
+
- lib/site_mapper.rb
|
|
134
135
|
- lib/site_mapper/crawl_url.rb
|
|
135
136
|
- lib/site_mapper/crawler.rb
|
|
136
137
|
- lib/site_mapper/logger.rb
|
|
137
138
|
- lib/site_mapper/request.rb
|
|
138
139
|
- lib/site_mapper/robots.rb
|
|
139
140
|
- lib/site_mapper/version.rb
|
|
140
|
-
- lib/site_mapper.rb
|
|
141
141
|
homepage: https://github.com/buren/site_mapper
|
|
142
142
|
licenses:
|
|
143
143
|
- MIT
|
|
@@ -148,17 +148,17 @@ require_paths:
|
|
|
148
148
|
- lib
|
|
149
149
|
required_ruby_version: !ruby/object:Gem::Requirement
|
|
150
150
|
requirements:
|
|
151
|
-
- -
|
|
151
|
+
- - ">="
|
|
152
152
|
- !ruby/object:Gem::Version
|
|
153
153
|
version: 1.9.3
|
|
154
154
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
|
155
155
|
requirements:
|
|
156
|
-
- -
|
|
156
|
+
- - ">="
|
|
157
157
|
- !ruby/object:Gem::Version
|
|
158
158
|
version: '0'
|
|
159
159
|
requirements: []
|
|
160
160
|
rubyforge_project:
|
|
161
|
-
rubygems_version: 2.
|
|
161
|
+
rubygems_version: 2.2.2
|
|
162
162
|
signing_key:
|
|
163
163
|
specification_version: 4
|
|
164
164
|
summary: Map all links on a given site.
|