site_mapper 0.0.6 → 0.0.7
- checksums.yaml +4 -4
- data/lib/site_mapper/crawl_url.rb +2 -2
- data/lib/site_mapper/crawler.rb +9 -7
- data/lib/site_mapper/request.rb +22 -2
- data/lib/site_mapper/robots.rb +144 -0
- data/lib/site_mapper/version.rb +1 -1
- data/lib/site_mapper.rb +1 -0
- metadata +6 -5
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 472f6b099e7d4c2fe67862faf59b703c65e39a8c
+  data.tar.gz: ce4b315b256fdded26a12665f50d60dccf127ea3
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 971616d2f3dd773e63be7d01259582099fd285538b52e04cbcc8055958d98fa57644d84e5546408401c390c6fc56ff70b9ecfc13b2c97bd15660a0a81dd98107
+  data.tar.gz: e27f9ae323e7a536696625071b30b9d046bbaced373e9ae798cd2083556c27bf228ff197a154a2291900b89fc4510264a4565ad1592a86a330f83989f2914a8f
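
The new checksums can be reproduced locally with Ruby's standard Digest library. A minimal sketch, assuming the two archive entries have been extracted from site_mapper-0.0.7.gem (a .gem file is a plain tar containing metadata.gz and data.tar.gz; the local paths here are hypothetical):

require 'digest'

# Hypothetical paths to the entries extracted from the .gem tar archive.
puts Digest::SHA1.file('metadata.gz').hexdigest
puts Digest::SHA512.file('data.tar.gz').hexdigest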
data/lib/site_mapper/crawl_url.rb
CHANGED
@@ -1,5 +1,5 @@
 module SiteMapper
-  # Crawl URL formatter
+  # Crawl URL formatter.
   class CrawlUrl
     attr_reader :resolved_base_url, :base_hostname
 
@@ -11,7 +11,7 @@ module SiteMapper
 
     # Given a link it constructs the absolute path,
     # if valid URL & URL has same domain as @resolved_base_url.
-    # @return [String] with absolute path to resource
+    # @return [String] with absolute path to resource
     # @param [String, String] raw_url from link element and current page URL
     # @example Construct absolute URL for '/path', example.com
     #   cu = CrawlUrl.new('example.com')
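
Going by the docstring above and the call site in crawler.rb below, CrawlUrl usage looks roughly like this sketch (the return values are assumptions based on the documented behavior, not confirmed output):

require 'site_mapper'

cu = SiteMapper::CrawlUrl.new('example.com')
# Builds an absolute URL for a relative href found on the current page.
cu.absolute_url_from('/path', 'http://example.com/about')
# => 'http://example.com/path' (expected per the docstring above)
# Links outside the base domain are rejected, which is why the
# crawler below checks the return value before queueing it.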
data/lib/site_mapper/crawler.rb
CHANGED
@@ -8,12 +8,13 @@ module SiteMapper
       'User-Agent' => "SiteMapper/#{SiteMapper::VERSION} (+#{CRAWLER_INFO_LINK})"
     }
 
-    def initialize(url, resolve
+    def initialize(url, resolve: false)
       base_url = Request.resolve_url(url)
       @options = { resolve: resolve }
       @crawl_url = CrawlUrl.new(base_url)
       @fetch_queue = CrawlQueue.new
       @processed = Set.new
+      @robots = Robots.new(base_url, HEADERS_HASH['User-Agent'])
     end
 
     # @see #collect_urls
@@ -21,7 +22,7 @@ module SiteMapper
       new(base_url).collect_urls { |url| yield(url) }
     end
 
-    # Collects all links on domain for domain
+    # Collects all links on domain for domain.
     # @return [Array] with links.
     # @example URLs for example.com
     #   crawler = Crawler.new('example.com')
@@ -52,14 +53,15 @@ module SiteMapper
       link_elements = Request.get_page(get_url).css('a') rescue []
       @processed << get_url
       link_elements.each do |page_link|
-
-        if
-          url = resolve(absolute_url)
-          @fetch_queue << url unless @processed.include?(url)
-        end
+        url = @crawl_url.absolute_url_from(page_link.attr('href'), get_url)
+        @fetch_queue << url if url && eligible_for_queue?(resolve(url))
       end
     end
 
+    def eligible_for_queue?(url)
+      @robots.allowed?(url) && !@processed.include?(url)
+    end
+
     def resolve(url)
       @options[:resolve] ? Request.resolve_url(url) : url
     end
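
With the keyword argument and the robots.txt check wired in, crawling should look like this sketch (based on the Crawler API documented above; example.com stands in for any host):

require 'site_mapper'

# Stream every same-domain URL as it is found; URLs disallowed by
# robots.txt are now filtered out via eligible_for_queue?.
SiteMapper::Crawler.collect_urls('example.com') { |url| puts url }

# Or instantiate directly; resolve: false (the default) skips the
# extra redirect resolution of every queued URL.
crawler = SiteMapper::Crawler.new('example.com', resolve: false)
urls = crawler.collect_urls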
data/lib/site_mapper/request.rb
CHANGED
@@ -1,13 +1,16 @@
 require 'url_resolver' # TODO: Allow users to use any resolver
 
 module SiteMapper
+  # Get webpage wrapper.
   class Request
     INFO_LINK = 'https://rubygems.org/gems/site_mapper'
     USER_AGENT = "SiteMapper/#{SiteMapper::VERSION} (+#{INFO_LINK})"
 
     class << self
-
-
+      # Given an URL get it then parse it with Nokogiri::HTML.
+      # @return [Nokogiri::HTML] a nokogiri HTML object
+      def get_page(url)
+        Nokogiri::HTML(Request.get_response_body(url))
       end
 
       def get_response(url, resolve = false)
@@ -21,12 +24,29 @@ module SiteMapper
         http.request(request)
       end
 
+      # Get response body, rescues with nil if an exception is raised
+      # @see #get_response
+      def get_response_body(*args)
+        get_response(*args).body rescue nil
+      end
+
+      # Resolve an URL string and follows redirects
+      # if the URL can't be resolved the original URL is returned.
+      # @return [String] a URL string that potentially is a redirected URL
+      # @example Resolve google.com
+      #   resolve_url('google.com')
+      #   # => 'https://www.google.com'
       def resolve_url(url, with_query: true)
         resolved = UrlResolver.resolve(url)
         resolved = remove_query(resolved) unless with_query
         resolved
       end
 
+      # Removes query string from URL string.
+      # @return [String] an URL string without query
+      # @example Removes query string
+      #   remove_query('example.com/path?q=keyword')
+      #   # => 'example.com/path'
       def remove_query(url)
         index = url.index('?')
         index.nil? ? url : url[0...index]
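
Taken together, the documented Request class methods behave as in this sketch (the outputs follow the @example tags above):

require 'site_mapper'

SiteMapper::Request.resolve_url('google.com')
# => 'https://www.google.com'
SiteMapper::Request.remove_query('example.com/path?q=keyword')
# => 'example.com/path'

# Fetch and parse; get_response_body rescues to nil on network
# errors, so get_page degrades to parsing an empty document.
doc = SiteMapper::Request.get_page('http://example.com')
doc.css('a').size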
data/lib/site_mapper/robots.rb
ADDED
@@ -0,0 +1,144 @@
+# Based on: https://rubygems.org/gems/robots, v0.10.1
+module SiteMapper
+  # Provided a base URL it checks whether a given URL is
+  # allowed to be crawled according to /robots.txt
+  # @see https://rubygems.org/gems/robots
+  class Robots
+    # Parses robots.txt
+    class ParsedRobots
+      def initialize(body, user_agent)
+        @other = {}
+        @disallows = {}
+        @allows = {}
+        @delays = {}
+        parse(body)
+      end
+
+      # Parse robots.txt body.
+      def parse(body)
+        agent = /.*/
+        body = body || "User-agent: *\nAllow: /\n"
+        body = body.downcase
+        body.each_line.each do |line|
+          next if line =~ /^\s*(#.*|$)/
+          arr = line.split(':')
+          key = arr.shift
+          value = arr.join(':').strip
+          value.strip!
+          case key
+          when 'user-agent'
+            agent = to_regex(value)
+          when 'allow'
+            @allows[agent] ||= []
+            @allows[agent] << to_regex(value)
+          when 'disallow'
+            @disallows[agent] ||= []
+            @disallows[agent] << to_regex(value)
+          when 'crawl-delay'
+            @delays[agent] = value.to_i
+          else
+            @other[key] ||= []
+            @other[key] << value
+          end
+        end
+        @parsed = true
+      end
+
+      # @return [Boolean] true if uri is allowed to be crawled
+      # @example Check if http://www.google.com/googlesites is allowed to be crawled
+      #   uri = URI.parse('http://www.google.com/googlesites')
+      #   robots.allowed?(uri, 'SiteMapper') # => false (as of 2014-10-22)
+      def allowed?(uri, user_agent)
+        return true unless @parsed
+        allowed = true
+        path = uri.request_uri
+
+        @disallows.each do |key, value|
+          if user_agent =~ key
+            value.each do |rule|
+              if path =~ rule
+                allowed = false
+              end
+            end
+          end
+        end
+
+        @allows.each do |key, value|
+          unless allowed
+            if user_agent =~ key
+              value.each do |rule|
+                if path =~ rule
+                  allowed = true
+                end
+              end
+            end
+          end
+        end
+        allowed
+      end
+
+      # @return [Hash] key/value pairs from robots.txt
+      def other_values
+        @other
+      end
+
+      protected
+
+      def to_regex(pattern)
+        return /should-not-match-anything-123456789/ if pattern.strip.empty?
+        pattern = Regexp.escape(pattern)
+        pattern.gsub!(Regexp.escape('*'), '.*')
+        Regexp.compile("^#{pattern}")
+      end
+    end
+
+    def initialize(url, user_agent)
+      @user_agent = user_agent
+      @parsed = {}
+      @robots_txt = Request.get_response_body("#{url}/robots.txt", true)
+    end
+
+    # @return [Boolean] true if uri is allowed to be crawled
+    # @example Check if http://www.google.com/googlesites is allowed to be crawled
+    #   robots = Robots.new('google.com', 'SiteMapper')
+    #   robots.allowed?('http://www.google.com/googlesites') # => false (as of 2014-10-22)
+    def allowed?(uri)
+      uri = to_uri(uri)
+      host = uri.host
+      @parsed[host] ||= ParsedRobots.new(@robots_txt, @user_agent)
+      @parsed[host].allowed?(uri, @user_agent)
+    rescue
+      true
+    end
+
+    # @return [Array] array of sitemaps defined in robots.txt
+    # @example Get sitemap for google.com
+    #   robots = Robots.new('google.com', 'SiteMapper')
+    #   robots.sitemaps
+    def sitemaps
+      uri = to_uri(uri)
+      values = other_values(uri.host)
+      values['sitemap'] or []
+    rescue
+      []
+    end
+
+    # @return [Hash] key/value pairs from robots.txt
+    # @example Get other values for google.com
+    #   robots = Robots.new('google.com', 'SiteMapper')
+    #   robots.other_values
+    def other_values(uri)
+      uri = to_uri(uri)
+      host = uri.host
+      @parsed[host] ||= ParsedRobots.new(@robots_txt, @user_agent)
+      @parsed[host].other_values
+    end
+
+    private
+
+    def to_uri(uri)
+      uri = URI.parse(uri.to_s) unless uri.is_a?(URI)
+      uri
+    end
+  end
+end
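
A short usage sketch for the new class, mirroring the @example tags above and the constructor call in crawler.rb (the URLs are placeholders):

require 'site_mapper'

user_agent = "SiteMapper/#{SiteMapper::VERSION} (+https://rubygems.org/gems/site_mapper)"
robots = SiteMapper::Robots.new('http://example.com', user_agent)

# true unless robots.txt disallows the path for this user agent;
# note that allowed? rescues any fetch/parse error to true (fail-open).
robots.allowed?('http://example.com/private')

# Sitemap URLs listed in robots.txt, if any.
robots.sitemaps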
data/lib/site_mapper/version.rb
CHANGED
data/lib/site_mapper.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: site_mapper
 version: !ruby/object:Gem::Version
-  version: 0.0.6
+  version: 0.0.7
 platform: ruby
 authors:
 - Jacob Burenstam
@@ -108,7 +108,7 @@ dependencies:
   - - ~>
     - !ruby/object:Gem::Version
       version: '0.7'
-description:
+description: Map all links on a given site.
 email:
 - burenstam@gmail.com
 executables:
@@ -117,11 +117,12 @@ extensions: []
 extra_rdoc_files: []
 files:
 - bin/site_mapper
-- lib/site_mapper.rb
 - lib/site_mapper/crawl_url.rb
 - lib/site_mapper/crawler.rb
 - lib/site_mapper/request.rb
+- lib/site_mapper/robots.rb
 - lib/site_mapper/version.rb
+- lib/site_mapper.rb
 homepage: https://github.com/buren/site_mapper
 licenses:
 - MIT
@@ -142,9 +143,9 @@ required_rubygems_version: !ruby/object:Gem::Requirement
     version: '0'
 requirements: []
 rubyforge_project:
-rubygems_version: 2.
+rubygems_version: 2.0.0
 signing_key:
 specification_version: 4
-summary:
+summary: Map all links on a given site.
 test_files: []
 has_rdoc:
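
Once 0.0.7 is installed, the updated metadata can be inspected through the standard RubyGems API:

require 'rubygems'

spec = Gem::Specification.find_by_name('site_mapper')
spec.version.to_s # => '0.0.7'
spec.summary      # => 'Map all links on a given site.'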