site_mapper 0.0.6 → 0.0.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/site_mapper/crawl_url.rb +2 -2
- data/lib/site_mapper/crawler.rb +9 -7
- data/lib/site_mapper/request.rb +22 -2
- data/lib/site_mapper/robots.rb +144 -0
- data/lib/site_mapper/version.rb +1 -1
- data/lib/site_mapper.rb +1 -0
- metadata +6 -5
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 472f6b099e7d4c2fe67862faf59b703c65e39a8c
+  data.tar.gz: ce4b315b256fdded26a12665f50d60dccf127ea3
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 971616d2f3dd773e63be7d01259582099fd285538b52e04cbcc8055958d98fa57644d84e5546408401c390c6fc56ff70b9ecfc13b2c97bd15660a0a81dd98107
+  data.tar.gz: e27f9ae323e7a536696625071b30b9d046bbaced373e9ae798cd2083556c27bf228ff197a154a2291900b89fc4510264a4565ad1592a86a330f83989f2914a8f
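
The new checksums can be compared against a locally fetched copy of the gem. A minimal verification sketch, not part of the package itself; the file name site_mapper-0.0.7.gem is assumed to come from running gem fetch site_mapper -v 0.0.7:

# Hypothetical check script; a .gem file is a tar archive whose
# metadata.gz and data.tar.gz members are what checksums.yaml covers.
require 'rubygems/package'
require 'digest'

File.open('site_mapper-0.0.7.gem', 'rb') do |io|
  Gem::Package::TarReader.new(io).each do |entry|
    next unless %w[metadata.gz data.tar.gz].include?(entry.full_name)
    puts "#{entry.full_name}: #{Digest::SHA512.hexdigest(entry.read)}"
  end
end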
data/lib/site_mapper/crawl_url.rb
CHANGED
@@ -1,5 +1,5 @@
 module SiteMapper
-  # Crawl URL formatter
+  # Crawl URL formatter.
   class CrawlUrl
     attr_reader :resolved_base_url, :base_hostname

@@ -11,7 +11,7 @@ module SiteMapper

     # Given a link it constructs the absolute path,
     # if valid URL & URL has same domain as @resolved_base_url.
-    # @return [String] with absolute path to resource
+    # @return [String] with absolute path to resource
     # @param [String, String] raw_url from link element and current page URL
     # @example Construct absolute URL for '/path', example.com
     #   cu = CrawlUrl.new('example.com')
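
For context, a minimal usage sketch of the class above, following its own @example; the return value shown is an assumption, not taken from the diff:

require 'site_mapper'

# Build an absolute URL from a link href and the page it was found on,
# the same way the crawler uses it below.
cu = SiteMapper::CrawlUrl.new('example.com')
cu.absolute_url_from('/path', 'http://example.com/')
# => "http://example.com/path" (assumed; nil when the URL is invalid or off-domain)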
data/lib/site_mapper/crawler.rb
CHANGED
@@ -8,12 +8,13 @@ module SiteMapper
       'User-Agent' => "SiteMapper/#{SiteMapper::VERSION} (+#{CRAWLER_INFO_LINK})"
     }

-    def initialize(url, resolve
+    def initialize(url, resolve: false)
       base_url = Request.resolve_url(url)
       @options = { resolve: resolve }
       @crawl_url = CrawlUrl.new(base_url)
       @fetch_queue = CrawlQueue.new
       @processed = Set.new
+      @robots = Robots.new(base_url, HEADERS_HASH['User-Agent'])
     end

     # @see #collect_urls
@@ -21,7 +22,7 @@ module SiteMapper
       new(base_url).collect_urls { |url| yield(url) }
     end

-    # Collects all links on domain for domain
+    # Collects all links on domain for domain.
     # @return [Array] with links.
     # @example URLs for example.com
     #   crawler = Crawler.new('example.com')
@@ -52,14 +53,15 @@ module SiteMapper
       link_elements = Request.get_page(get_url).css('a') rescue []
       @processed << get_url
       link_elements.each do |page_link|
-
-        if
-          url = resolve(absolute_url)
-          @fetch_queue << url unless @processed.include?(url)
-        end
+        url = @crawl_url.absolute_url_from(page_link.attr('href'), get_url)
+        @fetch_queue << url if url && eligible_for_queue?(resolve(url))
       end
     end

+    def eligible_for_queue?(url)
+      @robots.allowed?(url) && !@processed.include?(url)
+    end
+
     def resolve(url)
       @options[:resolve] ? Request.resolve_url(url) : url
     end
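
A minimal usage sketch of the crawler after this change; the resolve: keyword and the robots.txt filtering are the new parts, and the class-level helper is assumed to be Crawler.collect_urls based on the @see reference above:

require 'site_mapper'

# Yield every URL found on the domain as it is discovered.
SiteMapper::Crawler.collect_urls('example.com') { |url| puts url }

# Or instantiate directly. resolve: false (the new default) skips
# re-resolving each queued URL; URLs disallowed by robots.txt are no
# longer queued, via Robots#allowed? inside eligible_for_queue?.
crawler = SiteMapper::Crawler.new('example.com', resolve: false)
urls = crawler.collect_urls # => Array with links, per the @return doc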
data/lib/site_mapper/request.rb
CHANGED
@@ -1,13 +1,16 @@
 require 'url_resolver' # TODO: Allow users to use any resolver

 module SiteMapper
+  # Get webpage wrapper.
   class Request
     INFO_LINK = 'https://rubygems.org/gems/site_mapper'
     USER_AGENT = "SiteMapper/#{SiteMapper::VERSION} (+#{INFO_LINK})"

     class << self
-
-
+      # Given an URL get it then parse it with Nokogiri::HTML.
+      # @return [Nokogiri::HTML] a nokogiri HTML object
+      def get_page(url)
+        Nokogiri::HTML(Request.get_response_body(url))
       end

       def get_response(url, resolve = false)
@@ -21,12 +24,29 @@ module SiteMapper
         http.request(request)
       end

+      # Get response body, rescues with nil if an exception is raised
+      # @see #get_response
+      def get_response_body(*args)
+        get_response(*args).body rescue nil
+      end
+
+      # Resolve an URL string and follows redirects
+      # if the URL can't be resolved the original URL is returned.
+      # @return [String] a URL string that potentially is a redirected URL
+      # @example Resolve google.com
+      #   resolve_url('google.com')
+      #   # => 'https://www.google.com'
       def resolve_url(url, with_query: true)
         resolved = UrlResolver.resolve(url)
         resolved = remove_query(resolved) unless with_query
         resolved
       end

+      # Removes query string from URL string.
+      # @return [String] an URL string without query
+      # @example Removes query string
+      #   remove_query('example.com/path?q=keyword')
+      #   # => 'example.com/path'
       def remove_query(url)
         index = url.index('?')
         index.nil? ? url : url[0...index]
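
A minimal sketch of the Request helpers documented above, following the inline @examples; live results naturally depend on the network:

require 'site_mapper'

# Follow redirects; falls back to the original URL if resolution fails.
SiteMapper::Request.resolve_url('google.com') # => "https://www.google.com" per the @example
SiteMapper::Request.remove_query('example.com/path?q=keyword') # => "example.com/path"

# Fetch and parse a page; get_response_body rescues to nil on errors.
doc = SiteMapper::Request.get_page('http://example.com')
doc.css('a').map { |a| a.attr('href') }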
data/lib/site_mapper/robots.rb
ADDED
@@ -0,0 +1,144 @@
+# Based on: https://rubygems.org/gems/robots, v0.10.1
+module SiteMapper
+  # Provided a base URL it checks whether a given URL is
+  # allowed to be crawled according to /robots.txt
+  # @see https://rubygems.org/gems/robots
+  class Robots
+    # Parses robots.txt
+    class ParsedRobots
+      def initialize(body, user_agent)
+        @other = {}
+        @disallows = {}
+        @allows = {}
+        @delays = {}
+        parse(body)
+      end
+
+      # Parse robots.txt body.
+      def parse(body)
+        agent = /.*/
+        body = body || "User-agent: *\nAllow: /\n"
+        body = body.downcase
+        body.each_line.each do |line|
+          next if line =~ /^\s*(#.*|$)/
+          arr = line.split(':')
+          key = arr.shift
+          value = arr.join(':').strip
+          value.strip!
+          case key
+          when 'user-agent'
+            agent = to_regex(value)
+          when 'allow'
+            @allows[agent] ||= []
+            @allows[agent] << to_regex(value)
+          when 'disallow'
+            @disallows[agent] ||= []
+            @disallows[agent] << to_regex(value)
+          when 'crawl-delay'
+            @delays[agent] = value.to_i
+          else
+            @other[key] ||= []
+            @other[key] << value
+          end
+        end
+        @parsed = true
+      end
+
+      # @return [Boolean] true if uri is allowed to be crawled
+      # @example Check if http://www.google.com/googlesites is allowed to be crawled
+      #   uri = URI.parse('http://www.google.com/googlesites')
+      #   robots.allowed?(uri, 'SiteMapper') # => false (as of 2014-10-22)
+      def allowed?(uri, user_agent)
+        return true unless @parsed
+        allowed = true
+        path = uri.request_uri
+
+        @disallows.each do |key, value|
+          if user_agent =~ key
+            value.each do |rule|
+              if path =~ rule
+                allowed = false
+              end
+            end
+          end
+        end
+
+        @allows.each do |key, value|
+          unless allowed
+            if user_agent =~ key
+              value.each do |rule|
+                if path =~ rule
+                  allowed = true
+                end
+              end
+            end
+          end
+        end
+        allowed
+      end
+
+      # @return [Hash] key/value pairs from robots.txt
+      def other_values
+        @other
+      end
+
+      protected
+
+      def to_regex(pattern)
+        return /should-not-match-anything-123456789/ if pattern.strip.empty?
+        pattern = Regexp.escape(pattern)
+        pattern.gsub!(Regexp.escape('*'), '.*')
+        Regexp.compile("^#{pattern}")
+      end
+    end
+
+    def initialize(url, user_agent)
+      @user_agent = user_agent
+      @parsed = {}
+      @robots_txt = Request.get_response_body("#{url}/robots.txt", true)
+    end
+
+    # @return [Boolean] true if uri is allowed to be crawled
+    # @example Check if http://www.google.com/googlesites is allowed to be crawled
+    #   robots = Robots.new('google.com', 'SiteMapper')
+    #   robots.allowed?('http://www.google.com/googlesites') # => false (as of 2014-10-22)
+    def allowed?(uri)
+      uri = to_uri(uri)
+      host = uri.host
+      @parsed[host] ||= ParsedRobots.new(@robots_txt, @user_agent)
+      @parsed[host].allowed?(uri, @user_agent)
+    rescue
+      true
+    end
+
+    # @return [Array] array of sitemaps defined in robots.txt
+    # @example Get sitemap for google.com
+    #   robots = Robots.new('google.com', 'SiteMapper')
+    #   robots.sitemaps
+    def sitemaps
+      uri = to_uri(uri)
+      values = other_values(uri.host)
+      values['sitemap'] or []
+    rescue
+      []
+    end
+
+    # @return [Hash] key/value pairs from robots.txt
+    # @example Get other values for google.com
+    #   robots = Robots.new('google.com', 'SiteMapper')
+    #   robots.other_values
+    def other_values(uri)
+      uri = to_uri(uri)
+      host = uri.host
+      @parsed[host] ||= ParsedRobots.new(@robots_txt, @user_agent)
+      @parsed[host].other_values
+    end
+
+    private
+
+    def to_uri(uri)
+      uri = URI.parse(uri.to_s) unless uri.is_a?(URI)
+      uri
+    end
+  end
+end
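
A minimal usage sketch of the new class, mirroring its own @example comments; the crawler constructs it as Robots.new(base_url, HEADERS_HASH['User-Agent']):

require 'site_mapper'

robots = SiteMapper::Robots.new('google.com', 'SiteMapper')

# Per-URL check against the fetched robots.txt; rescues to true on failure.
robots.allowed?('http://www.google.com/googlesites') # => false per the @example, as of 2014-10-22
robots.allowed?('http://www.google.com/')            # => true, assuming robots.txt permits it

# Sitemap entries and any other directives parsed from robots.txt.
robots.sitemaps                   # => Array of sitemap URLs, [] on failure
robots.other_values('google.com') # => Hash of remaining key/value pairs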
data/lib/site_mapper/version.rb
CHANGED
data/lib/site_mapper.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: site_mapper
 version: !ruby/object:Gem::Version
-  version: 0.0.
+  version: 0.0.7
 platform: ruby
 authors:
 - Jacob Burenstam
@@ -108,7 +108,7 @@ dependencies:
     - - ~>
       - !ruby/object:Gem::Version
         version: '0.7'
-description:
+description: Map all links on a given site.
 email:
 - burenstam@gmail.com
 executables:
@@ -117,11 +117,12 @@ extensions: []
 extra_rdoc_files: []
 files:
 - bin/site_mapper
-- lib/site_mapper.rb
 - lib/site_mapper/crawl_url.rb
 - lib/site_mapper/crawler.rb
 - lib/site_mapper/request.rb
+- lib/site_mapper/robots.rb
 - lib/site_mapper/version.rb
+- lib/site_mapper.rb
 homepage: https://github.com/buren/site_mapper
 licenses:
 - MIT
@@ -142,9 +143,9 @@ required_rubygems_version: !ruby/object:Gem::Requirement
       version: '0'
 requirements: []
 rubyforge_project:
-rubygems_version: 2.
+rubygems_version: 2.0.0
 signing_key:
 specification_version: 4
-summary:
+summary: Map all links on a given site.
 test_files: []
 has_rdoc: