site_mapper 0.0.8 → 0.0.9
- checksums.yaml +4 -4
- data/lib/site_mapper.rb +8 -3
- data/lib/site_mapper/crawl_url.rb +3 -1
- data/lib/site_mapper/crawler.rb +20 -9
- data/lib/site_mapper/request.rb +18 -3
- data/lib/site_mapper/robots.rb +38 -11
- data/lib/site_mapper/version.rb +2 -1
- metadata +2 -2
checksums.yaml
CHANGED
```diff
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: d2f7c3a2410e0cb508297f6fb229c1e5ec5e8f98
+  data.tar.gz: 6495918729c0e0d4c8788c4eb126c829b72dc17b
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 80fceb2fd976b3c0e6ffe2f7237ffe9645257a271b99ca7f919e147a132fd09275862ba8c0563924e533e6b5049cd192726a06c48a8c4a4efa1b61b0af2c1e55
+  data.tar.gz: 9ff4ef13486b564bc9f8b3d60c42c248c26ed4245584af3e05a0a907b41ac7bf6bfc44ab5ece3cee91216c240201990c65e0adfa20960b28a0b37f8f6eb6ad4e
```
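These digests cover the two archives packed inside the published `.gem` file (a `.gem` is a tar archive containing `metadata.gz` and `data.tar.gz`). A short Ruby sketch for checking them locally; the path assumes you have already fetched and unpacked the `.gem`:

```ruby
require 'digest'

# Compare the printed digests against the checksums.yaml values above.
puts Digest::SHA1.file('data.tar.gz').hexdigest
puts Digest::SHA512.file('data.tar.gz').hexdigest
```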
data/lib/site_mapper.rb
CHANGED
```diff
@@ -9,12 +9,17 @@ require 'site_mapper/crawl_url'
 
 # Find all links on domain to domain
 module SiteMapper
-  #
+  # SiteMapper info link
+  INFO_LINK = 'https://rubygems.org/gems/site_mapper'
+  # SiteMapper User-Agent
+  USER_AGENT = "SiteMapper/#{SiteMapper::VERSION} (+#{INFO_LINK})"
+
+  # Map all links on a given site.
   # @return [Array] with links.
   # @param [String] link to domain
   # @example Collect all URLs from example.com
   #   SiteMapper.map('example.com')
-  def self.map(
-    Crawler.collect_urls(
+  def self.map(link)
+    Crawler.collect_urls(link) { |url| yield(url) if block_given? }
   end
 end
```
data/lib/site_mapper/crawl_url.rb
CHANGED
```diff
@@ -3,6 +3,7 @@ module SiteMapper
   class CrawlUrl
     attr_reader :resolved_base_url, :base_hostname
 
+    # @param [String] base_url
     def initialize(base_url)
       @resolved_base_url = Request.resolve_url(base_url, with_query: false)
       @base_hostname = URI.parse(@resolved_base_url).hostname
@@ -11,8 +12,9 @@ module SiteMapper
 
     # Given a link it constructs the absolute path,
     # if valid URL & URL has same domain as @resolved_base_url.
+    # @param [String] raw_url url found on page
+    # @param [String] get_url current page url
     # @return [String] with absolute path to resource
-    # @param [String, String] raw_url from link element and current page URL
     # @example Construct absolute URL for '/path', example.com
     #   cu = CrawlUrl.new('example.com')
     #   cu.absolute_url_from('/path', 'example.com/some/path')
```
data/lib/site_mapper/crawler.rb
CHANGED
```diff
@@ -2,19 +2,17 @@ require 'set'
 require 'nokogiri'
 
 module SiteMapper
+  # Crawls a given site.
   class Crawler
-
-
-      'User-Agent' => "SiteMapper/#{SiteMapper::VERSION} (+#{CRAWLER_INFO_LINK})"
-    }
-
+    # @param [String] url base url for crawler
+    # @param [Hash] resolve (optional false by default)
     def initialize(url, resolve: false)
-      base_url = Request.resolve_url(url)
+      @base_url = Request.resolve_url(url)
       @options = { resolve: resolve }
-      @crawl_url = CrawlUrl.new(base_url)
+      @crawl_url = CrawlUrl.new(@base_url)
       @fetch_queue = CrawlQueue.new
       @processed = Set.new
-      @robots =
+      @robots = nil
     end
 
     # @see #collect_urls
@@ -59,7 +57,14 @@ module SiteMapper
     end
 
     def eligible_for_queue?(url)
-
+      robots.allowed?(url) && !@processed.include?(url)
+    end
+
+    def robots
+      return @robots unless @robots.nil?
+      robots_body = Request.get_response_body("#{@base_url}/robots.txt")
+      @robots = Robots.new(robots_body, URI.parse(@base_url).host, SiteMapper::USER_AGENT)
+      @robots
     end
 
     def resolve(url)
@@ -67,12 +72,18 @@ module SiteMapper
     end
   end
 
+  # Queue of urls to be crawled.
   class CrawlQueue
+    # @return [Set] that exends EnumerablePop module
    def self.new
      Set.new.extend(EnumerablePop)
    end
 
+    # Add pop method when added to class.
+    # The class that extends this module need to implement #first and #delete.
    module EnumerablePop
+      # Pop first element from list.
+      # @return [Object] the first object in the list or nil
      def pop
        first_element = first
        delete(first_element)
```
data/lib/site_mapper/request.rb
CHANGED
```diff
@@ -3,16 +3,28 @@ require 'url_resolver' # TODO: Allow users to use any resolver
 module SiteMapper
   # Get webpage wrapper.
   class Request
+    # Request info link
     INFO_LINK = 'https://rubygems.org/gems/site_mapper'
+    # Request User-Agent
     USER_AGENT = "SiteMapper/#{SiteMapper::VERSION} (+#{INFO_LINK})"
 
     class << self
       # Given an URL get it then parse it with Nokogiri::HTML.
+      # @param [String] url
       # @return [Nokogiri::HTML] a nokogiri HTML object
       def get_page(url)
         Nokogiri::HTML(Request.get_response_body(url))
       end
 
+      # Given an URL get the response.
+      # @param [String] url
+      # @param [Boolean] resolve (optional and false by default)
+      # @return [Net::HTTPOK] if response is successfull, raises error otherwise
+      # @example get example.com and resolve the URL
+      #    Request.get_response('example.com', true)
+      # @example get example.com and do *not* resolve the URL
+      #    Request.get_response('http://example.com')
+      #    Request.get_response('http://example.com', false)
       def get_response(url, resolve = false)
         resolved_url = resolve ? resolve_url(url) : url
         uri = URI.parse(resolved_url)
@@ -20,18 +32,20 @@ module SiteMapper
         http.use_ssl = true if resolved_url.include?('https://')
 
         request = Net::HTTP::Get.new(uri.request_uri)
-        request['User-Agent'] = USER_AGENT
+        request['User-Agent'] = SiteMapper::USER_AGENT
         http.request(request)
       end
 
-      # Get response body, rescues with nil if an exception is raised
+      # Get response body, rescues with nil if an exception is raised.
       # @see #get_response
       def get_response_body(*args)
         get_response(*args).body rescue nil
       end
 
-      # Resolve an URL string and follows redirects
+      # Resolve an URL string and follows redirects.
       # if the URL can't be resolved the original URL is returned.
+      # @param [String] url
+      # @param [Boolean] with_query (optional and true by default)
       # @return [String] a URL string that potentially is a redirected URL
       # @example Resolve google.com
       #    resolve_url('google.com')
@@ -43,6 +57,7 @@ module SiteMapper
       end
 
       # Removes query string from URL string.
+      # @param [String] url
       # @return [String] an URL string without query
       # @example Removes query string
       #    remove_query('example.com/path?q=keyword')
```
data/lib/site_mapper/robots.rb
CHANGED
```diff
@@ -1,7 +1,7 @@
 # Based on: https://rubygems.org/gems/robots, v0.10.1
 module SiteMapper
   # Provided a base URL it checks whether a given URL is
-  # allowed to be crawled according to /robots.txt
+  # allowed to be crawled according to /robots.txt.
   # @see https://rubygems.org/gems/robots
   class Robots
     # Parses robots.txt
@@ -11,10 +11,12 @@ module SiteMapper
         @disallows = {}
         @allows = {}
         @delays = {}
+        @sitemaps = []
         parse(body)
       end
 
       # Parse robots.txt body.
+      # @param [String] body the webpage body HTML
       def parse(body)
         agent = /.*/
         body = body || "User-agent: *\nAllow: /\n"
@@ -36,6 +38,8 @@ module SiteMapper
             @disallows[agent] << to_regex(value)
           when 'crawl-delay'
             @delays[agent] = value.to_i
+          when 'sitemap'
+            @sitemaps << value
           else
             @other[key] ||= []
             @other[key] << value
@@ -43,15 +47,20 @@ module SiteMapper
         end
         @parsed = true
       end
-
+
+      # @param [URI] uri to be checked
+      # @param [String] user_agent to be checked
       # @return [Boolean] true if uri is allowed to be crawled
       # @example Check if http://www.google.com/googlesites is allowed to be crawled
       #    uri = URI.parse('http://www.google.com/googlesites')
-      #    robots.allowed?(uri, 'SiteMapper')
+      #    robots.allowed?(uri, 'SiteMapper')
+      #    # => false (as of 2014-10-22)
       def allowed?(uri, user_agent)
         return true unless @parsed
         allowed = true
         path = uri.request_uri
+
+        user_agent.downcase!
 
         @disallows.each do |key, value|
           if user_agent =~ key
@@ -76,11 +85,24 @@ module SiteMapper
         end
         allowed
       end
+
+      # @param [String] user_agent
+      # @return [Integer] crawl delay for user_agent
+      def crawl_delay(user_agent)
+        agent = user_agent.dup
+        agent = to_regex(agent.downcase) if user_agent.is_a?(String)
+        @delays[agent]
+      end
 
       # @return [Hash] key/value pairs from robots.txt
       def other_values
         @other
       end
+
+      # @return [Array] returns sitemaps defined in robots.txt
+      def sitemaps
+        @sitemaps
+      end
 
       protected
 
@@ -92,12 +114,15 @@ module SiteMapper
       end
     end
 
-
+    # @param [String] url to fetch /robots.txt from
+    def initialize(robots_txt, hostname, user_agent)
+      @robots_txt = robots_txt
+      @hostname = hostname
       @user_agent = user_agent
       @parsed = {}
-      @robots_txt = Request.get_response_body("#{url}/robots.txt", true)
     end
 
+    # @param [String, URI] uri String or URI to check
     # @return [Boolean] true if uri is allowed to be crawled
     # @example Check if http://www.google.com/googlesites is allowed to be crawled
     #    robots = Robots.new('google.com', 'SiteMapper')
@@ -116,22 +141,24 @@ module SiteMapper
     #    robots = Robots.new('google.com', 'SiteMapper')
     #    robots.sitemaps
     def sitemaps
-
-
-
+      host = @hostname
+      @parsed[host] ||= ParsedRobots.new(@robots_txt, @user_agent)
+      @parsed[host].sitemaps
     rescue
       []
     end
 
+    # @param [String, URI] uri String or URI get other_values from
     # @return [Hash] key/value pairs from robots.txt
     # @example Get other values for google.com
     #    robots = Robots.new('google.com', 'SiteMapper')
     #    robots.other_values
-    def other_values
-
-      host = uri.host
+    def other_values
+      host = @hostname
       @parsed[host] ||= ParsedRobots.new(@robots_txt, @user_agent)
       @parsed[host].other_values
+    rescue
+      {}
     end
 
     private
```
data/lib/site_mapper/version.rb
CHANGED
metadata
CHANGED
```diff
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: site_mapper
 version: !ruby/object:Gem::Version
-  version: 0.0.8
+  version: 0.0.9
 platform: ruby
 authors:
 - Jacob Burenstam
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2014-10-
+date: 2014-10-24 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: nokogiri
```