site_mapper 0.0.8 → 0.0.9
- checksums.yaml +4 -4
- data/lib/site_mapper.rb +8 -3
- data/lib/site_mapper/crawl_url.rb +3 -1
- data/lib/site_mapper/crawler.rb +20 -9
- data/lib/site_mapper/request.rb +18 -3
- data/lib/site_mapper/robots.rb +38 -11
- data/lib/site_mapper/version.rb +2 -1
- metadata +2 -2
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: d2f7c3a2410e0cb508297f6fb229c1e5ec5e8f98
+  data.tar.gz: 6495918729c0e0d4c8788c4eb126c829b72dc17b
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 80fceb2fd976b3c0e6ffe2f7237ffe9645257a271b99ca7f919e147a132fd09275862ba8c0563924e533e6b5049cd192726a06c48a8c4a4efa1b61b0af2c1e55
+  data.tar.gz: 9ff4ef13486b564bc9f8b3d60c42c248c26ed4245584af3e05a0a907b41ac7bf6bfc44ab5ece3cee91216c240201990c65e0adfa20960b28a0b37f8f6eb6ad4e
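The block above records SHA1 and SHA512 digests for the gem's two packaged archives, metadata.gz and data.tar.gz. As a minimal sketch (assuming the archives have been extracted from the .gem file into the current directory; the file path and comparison are illustrative, not part of the gem), such a digest could be checked with Ruby's Digest:

    require 'digest'

    # SHA512 recorded for data.tar.gz in checksums.yaml above
    expected = '9ff4ef13486b564bc9f8b3d60c42c248c26ed4245584af3e05a0a907b41ac7bf' \
               '6bfc44ab5ece3cee91216c240201990c65e0adfa20960b28a0b37f8f6eb6ad4e'

    # Digest the local file and compare
    actual = Digest::SHA512.file('data.tar.gz').hexdigest
    puts(actual == expected ? 'checksum OK' : 'checksum mismatch')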
data/lib/site_mapper.rb CHANGED
@@ -9,12 +9,17 @@ require 'site_mapper/crawl_url'
 
 # Find all links on domain to domain
 module SiteMapper
-  #
+  # SiteMapper info link
+  INFO_LINK = 'https://rubygems.org/gems/site_mapper'
+  # SiteMapper User-Agent
+  USER_AGENT = "SiteMapper/#{SiteMapper::VERSION} (+#{INFO_LINK})"
+
+  # Map all links on a given site.
   # @return [Array] with links.
   # @param [String] link to domain
   # @example Collect all URLs from example.com
   #   SiteMapper.map('example.com')
-  def self.map(
-    Crawler.collect_urls(
+  def self.map(link)
+    Crawler.collect_urls(link) { |url| yield(url) if block_given? }
   end
 end
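Together with the @example above, the new signature means SiteMapper.map takes a link and an optional block that is yielded each discovered URL. A short usage sketch (the block body is illustrative):

    require 'site_mapper'

    # Collect all URLs found on example.com; the optional block
    # receives each URL as the crawler finds it.
    urls = SiteMapper.map('example.com') do |url|
      puts "Found: #{url}"
    end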
data/lib/site_mapper/crawl_url.rb CHANGED
@@ -3,6 +3,7 @@ module SiteMapper
   class CrawlUrl
     attr_reader :resolved_base_url, :base_hostname
 
+    # @param [String] base_url
     def initialize(base_url)
       @resolved_base_url = Request.resolve_url(base_url, with_query: false)
       @base_hostname = URI.parse(@resolved_base_url).hostname
@@ -11,8 +12,9 @@ module SiteMapper
 
     # Given a link it constructs the absolute path,
     # if valid URL & URL has same domain as @resolved_base_url.
+    # @param [String] raw_url url found on page
+    # @param [String] get_url current page url
     # @return [String] with absolute path to resource
-    # @param [String, String] raw_url from link element and current page URL
     # @example Construct absolute URL for '/path', example.com
     #   cu = CrawlUrl.new('example.com')
     #   cu.absolute_url_from('/path', 'example.com/some/path')
data/lib/site_mapper/crawler.rb CHANGED
@@ -2,19 +2,17 @@ require 'set'
 require 'nokogiri'
 
 module SiteMapper
+  # Crawls a given site.
   class Crawler
-
-
-      'User-Agent' => "SiteMapper/#{SiteMapper::VERSION} (+#{CRAWLER_INFO_LINK})"
-    }
-
+    # @param [String] url base url for crawler
+    # @param [Hash] resolve (optional false by default)
     def initialize(url, resolve: false)
-      base_url = Request.resolve_url(url)
+      @base_url = Request.resolve_url(url)
       @options = { resolve: resolve }
-      @crawl_url = CrawlUrl.new(base_url)
+      @crawl_url = CrawlUrl.new(@base_url)
       @fetch_queue = CrawlQueue.new
       @processed = Set.new
-      @robots =
+      @robots = nil
     end
 
     # @see #collect_urls
@@ -59,7 +57,14 @@ module SiteMapper
     end
 
     def eligible_for_queue?(url)
-
+      robots.allowed?(url) && !@processed.include?(url)
+    end
+
+    def robots
+      return @robots unless @robots.nil?
+      robots_body = Request.get_response_body("#{@base_url}/robots.txt")
+      @robots = Robots.new(robots_body, URI.parse(@base_url).host, SiteMapper::USER_AGENT)
+      @robots
     end
 
     def resolve(url)
@@ -67,12 +72,18 @@ module SiteMapper
     end
   end
 
+  # Queue of urls to be crawled.
   class CrawlQueue
+    # @return [Set] that extends the EnumerablePop module
    def self.new
      Set.new.extend(EnumerablePop)
    end
 
+    # Adds a pop method to the class that extends this module.
+    # The extending class needs to implement #first and #delete.
    module EnumerablePop
+      # Pop the first element from the list.
+      # @return [Object] the first object in the list or nil
      def pop
        first_element = first
        delete(first_element)
data/lib/site_mapper/request.rb CHANGED
@@ -3,16 +3,28 @@ require 'url_resolver' # TODO: Allow users to use any resolver
 module SiteMapper
   # Get webpage wrapper.
   class Request
+    # Request info link
     INFO_LINK = 'https://rubygems.org/gems/site_mapper'
+    # Request User-Agent
     USER_AGENT = "SiteMapper/#{SiteMapper::VERSION} (+#{INFO_LINK})"
 
     class << self
       # Given an URL get it then parse it with Nokogiri::HTML.
+      # @param [String] url
       # @return [Nokogiri::HTML] a nokogiri HTML object
       def get_page(url)
         Nokogiri::HTML(Request.get_response_body(url))
       end
 
+      # Given an URL get the response.
+      # @param [String] url
+      # @param [Boolean] resolve (optional and false by default)
+      # @return [Net::HTTPOK] if the response is successful, raises an error otherwise
+      # @example get example.com and resolve the URL
+      #   Request.get_response('example.com', true)
+      # @example get example.com and do *not* resolve the URL
+      #   Request.get_response('http://example.com')
+      #   Request.get_response('http://example.com', false)
       def get_response(url, resolve = false)
         resolved_url = resolve ? resolve_url(url) : url
         uri = URI.parse(resolved_url)
@@ -20,18 +32,20 @@ module SiteMapper
         http.use_ssl = true if resolved_url.include?('https://')
 
         request = Net::HTTP::Get.new(uri.request_uri)
-        request['User-Agent'] = USER_AGENT
+        request['User-Agent'] = SiteMapper::USER_AGENT
         http.request(request)
       end
 
-      # Get response body, rescues with nil if an exception is raised
+      # Get response body, rescues with nil if an exception is raised.
       # @see #get_response
       def get_response_body(*args)
        get_response(*args).body rescue nil
      end
 
-      # Resolve an URL string and follows redirects
+      # Resolves an URL string and follows redirects;
      # if the URL can't be resolved the original URL is returned.
+      # @param [String] url
+      # @param [Boolean] with_query (optional and true by default)
      # @return [String] a URL string that potentially is a redirected URL
      # @example Resolve google.com
      #   resolve_url('google.com')
@@ -43,6 +57,7 @@ module SiteMapper
      end
 
      # Removes query string from URL string.
+      # @param [String] url
      # @return [String] an URL string without query
      # @example Removes query string
      #   remove_query('example.com/path?q=keyword')
data/lib/site_mapper/robots.rb CHANGED
@@ -1,7 +1,7 @@
 # Based on: https://rubygems.org/gems/robots, v0.10.1
 module SiteMapper
   # Provided a base URL it checks whether a given URL is
-  # allowed to be crawled according to /robots.txt
+  # allowed to be crawled according to /robots.txt.
   # @see https://rubygems.org/gems/robots
   class Robots
     # Parses robots.txt
@@ -11,10 +11,12 @@ module SiteMapper
       @disallows = {}
       @allows = {}
       @delays = {}
+      @sitemaps = []
       parse(body)
     end
 
     # Parse robots.txt body.
+    # @param [String] body the robots.txt body
     def parse(body)
       agent = /.*/
       body = body || "User-agent: *\nAllow: /\n"
@@ -36,6 +38,8 @@ module SiteMapper
           @disallows[agent] << to_regex(value)
         when 'crawl-delay'
           @delays[agent] = value.to_i
+        when 'sitemap'
+          @sitemaps << value
         else
           @other[key] ||= []
           @other[key] << value
@@ -43,15 +47,20 @@ module SiteMapper
       end
       @parsed = true
     end
-
+
+    # @param [URI] uri to be checked
+    # @param [String] user_agent to be checked
     # @return [Boolean] true if uri is allowed to be crawled
     # @example Check if http://www.google.com/googlesites is allowed to be crawled
     #   uri = URI.parse('http://www.google.com/googlesites')
-    #   robots.allowed?(uri, 'SiteMapper')
+    #   robots.allowed?(uri, 'SiteMapper')
+    #   # => false (as of 2014-10-22)
     def allowed?(uri, user_agent)
       return true unless @parsed
       allowed = true
       path = uri.request_uri
+
+      user_agent.downcase!
 
       @disallows.each do |key, value|
         if user_agent =~ key
@@ -76,11 +85,24 @@ module SiteMapper
       end
       allowed
     end
+
+    # @param [String] user_agent
+    # @return [Integer] crawl delay for user_agent
+    def crawl_delay(user_agent)
+      agent = user_agent.dup
+      agent = to_regex(agent.downcase) if user_agent.is_a?(String)
+      @delays[agent]
+    end
 
     # @return [Hash] key/value pairs from robots.txt
     def other_values
       @other
     end
+
+    # @return [Array] returns sitemaps defined in robots.txt
+    def sitemaps
+      @sitemaps
+    end
 
     protected
 
@@ -92,12 +114,15 @@ module SiteMapper
       end
     end
 
-
+    # @param [String] robots_txt contents of /robots.txt
+    # @param [String] hostname host the robots.txt belongs to
+    # @param [String] user_agent to check against
+    def initialize(robots_txt, hostname, user_agent)
+      @robots_txt = robots_txt
+      @hostname = hostname
       @user_agent = user_agent
       @parsed = {}
-      @robots_txt = Request.get_response_body("#{url}/robots.txt", true)
     end
 
+    # @param [String, URI] uri String or URI to check
     # @return [Boolean] true if uri is allowed to be crawled
     # @example Check if http://www.google.com/googlesites is allowed to be crawled
     #   robots = Robots.new('google.com', 'SiteMapper')
@@ -116,22 +141,24 @@ module SiteMapper
     #   robots = Robots.new('google.com', 'SiteMapper')
     #   robots.sitemaps
     def sitemaps
-
-
-
+      host = @hostname
+      @parsed[host] ||= ParsedRobots.new(@robots_txt, @user_agent)
+      @parsed[host].sitemaps
     rescue
       []
     end
 
     # @return [Hash] key/value pairs from robots.txt
     # @example Get other values for google.com
     #   robots = Robots.new('google.com', 'SiteMapper')
     #   robots.other_values
-    def other_values
-
-      host = uri.host
+    def other_values
+      host = @hostname
       @parsed[host] ||= ParsedRobots.new(@robots_txt, @user_agent)
       @parsed[host].other_values
+    rescue
+      {}
     end
 
     private
data/lib/site_mapper/version.rb CHANGED
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: site_mapper
 version: !ruby/object:Gem::Version
-  version: 0.0.8
+  version: 0.0.9
 platform: ruby
 authors:
 - Jacob Burenstam
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2014-10-
+date: 2014-10-24 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: nokogiri