site_mapper 0.0.12 → 0.0.13
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/site_mapper/crawl_url.rb +1 -1
- data/lib/site_mapper/crawler.rb +7 -14
- data/lib/site_mapper/logger.rb +2 -2
- data/lib/site_mapper/version.rb +1 -1
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: f4c94ba470563b948533b4f9f39023ea0bef89bd
|
4
|
+
data.tar.gz: ce8e91a137ee13825fdd1b5ef0fe1220d48d85a1
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 36171ef9626a3f905d2441582a95f81b36c2ac5cd068ea3015c32b6c540d8cebc50ba6f4ecec2ac9ea889b8d164c383d2f5cea86cc53401ea33e6183c0bdd1e2
|
7
|
+
data.tar.gz: cd9e6bd664c97b0f31dce8e7b56a2b99e92261f20e5c4876b37c89afbe8b616149284a2d579b8c9b15849e5fc0393ef97cf07d7c02e2120fa8b7a01456891f9c
|
@@ -11,7 +11,7 @@ module SiteMapper
|
|
11
11
|
# @example Intitialize CrawlUrl with example.com as base_url
|
12
12
|
# CrawlUrl.new('example.com')
|
13
13
|
def initialize(base_url)
|
14
|
-
@resolved_base_url = Request.resolve_url(base_url)
|
14
|
+
@resolved_base_url = Request.resolve_url(base_url)
|
15
15
|
@base_hostname = URI.parse(@resolved_base_url).hostname
|
16
16
|
end
|
17
17
|
|
data/lib/site_mapper/crawler.rb
CHANGED
@@ -6,18 +6,14 @@ module SiteMapper
|
|
6
6
|
class Crawler
|
7
7
|
# Default options
|
8
8
|
OPTIONS = {
|
9
|
-
resolve: false,
|
10
9
|
sleep_length: 0.5,
|
11
10
|
max_requests: Float::INFINITY
|
12
11
|
}
|
13
12
|
|
14
13
|
# @param [String] url base url for crawler
|
15
|
-
# @param [Hash] options hash
|
16
|
-
#
|
17
|
-
# @example Create crawler with custom User-agent
|
14
|
+
# @param [Hash] options hash
|
15
|
+
# @example Create crawler with custom User-Agent
|
18
16
|
# Crawler.new('example.com', user_agent: 'MyUserAgent')
|
19
|
-
# @example Create crawler and resolve all urls
|
20
|
-
# Crawler.new('example.com', resolve: true)
|
21
17
|
# @example Create crawler and sleep 1 second between each request
|
22
18
|
# Crawler.new('example.com', sleep_length: 1)
|
23
19
|
# @example Create crawler and perform max 3 requests
|
@@ -75,7 +71,7 @@ module SiteMapper
|
|
75
71
|
@processed << current_url
|
76
72
|
link_elements.each do |page_link|
|
77
73
|
url = @crawl_url.absolute_url_from(page_link.attr('href'), current_url)
|
78
|
-
@fetch_queue << url if url && eligible_for_queue?(
|
74
|
+
@fetch_queue << url if url && eligible_for_queue?(url)
|
79
75
|
end
|
80
76
|
end
|
81
77
|
|
@@ -85,22 +81,19 @@ module SiteMapper
|
|
85
81
|
|
86
82
|
def robots
|
87
83
|
return @robots unless @robots.nil?
|
88
|
-
|
89
|
-
|
84
|
+
robots_url = URI.join(@base_url, '/robots.txt').to_s
|
85
|
+
robots_body = Request.response_body(robots_url, user_agent: @options[:user_agent])
|
86
|
+
@robots = Robots.new(robots_body, URI.parse(@base_url).host, @options[:user_agent])
|
90
87
|
@robots
|
91
88
|
end
|
92
89
|
|
93
|
-
def resolve(url)
|
94
|
-
@options[:resolve] ? Request.resolve_url(url) : url
|
95
|
-
end
|
96
|
-
|
97
90
|
def wait
|
98
91
|
sleep @options[:sleep_length]
|
99
92
|
end
|
100
93
|
|
101
94
|
# Queue of urls to be crawled.
|
102
95
|
class CrawlQueue
|
103
|
-
# @return [Set] that
|
96
|
+
# @return [Set] that extends EnumerablePop module
|
104
97
|
def self.new
|
105
98
|
Set.new.extend(EnumerablePop)
|
106
99
|
end
|
data/lib/site_mapper/logger.rb
CHANGED
@@ -45,13 +45,13 @@ module SiteMapper
|
|
45
45
|
# Log to STDOUT
|
46
46
|
# @param [String] msg to be logged to STDOUT
|
47
47
|
def self.log(msg)
|
48
|
-
|
48
|
+
$stdout.puts(msg)
|
49
49
|
end
|
50
50
|
|
51
51
|
# Log to STDERR
|
52
52
|
# @param [String] msg to be logged to STDERR
|
53
53
|
def self.err_log(msg)
|
54
|
-
|
54
|
+
$stderr.puts("[ERROR] #{msg}")
|
55
55
|
end
|
56
56
|
end
|
57
57
|
|
data/lib/site_mapper/version.rb
CHANGED