site_mapper 0.0.12 → 0.0.13
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/site_mapper/crawl_url.rb +1 -1
- data/lib/site_mapper/crawler.rb +7 -14
- data/lib/site_mapper/logger.rb +2 -2
- data/lib/site_mapper/version.rb +1 -1
- metadata +1 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA1:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: f4c94ba470563b948533b4f9f39023ea0bef89bd
|
|
4
|
+
data.tar.gz: ce8e91a137ee13825fdd1b5ef0fe1220d48d85a1
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 36171ef9626a3f905d2441582a95f81b36c2ac5cd068ea3015c32b6c540d8cebc50ba6f4ecec2ac9ea889b8d164c383d2f5cea86cc53401ea33e6183c0bdd1e2
|
|
7
|
+
data.tar.gz: cd9e6bd664c97b0f31dce8e7b56a2b99e92261f20e5c4876b37c89afbe8b616149284a2d579b8c9b15849e5fc0393ef97cf07d7c02e2120fa8b7a01456891f9c
|
|
@@ -11,7 +11,7 @@ module SiteMapper
|
|
|
11
11
|
# @example Intitialize CrawlUrl with example.com as base_url
|
|
12
12
|
# CrawlUrl.new('example.com')
|
|
13
13
|
def initialize(base_url)
|
|
14
|
-
@resolved_base_url = Request.resolve_url(base_url)
|
|
14
|
+
@resolved_base_url = Request.resolve_url(base_url)
|
|
15
15
|
@base_hostname = URI.parse(@resolved_base_url).hostname
|
|
16
16
|
end
|
|
17
17
|
|
data/lib/site_mapper/crawler.rb
CHANGED
|
@@ -6,18 +6,14 @@ module SiteMapper
|
|
|
6
6
|
class Crawler
|
|
7
7
|
# Default options
|
|
8
8
|
OPTIONS = {
|
|
9
|
-
resolve: false,
|
|
10
9
|
sleep_length: 0.5,
|
|
11
10
|
max_requests: Float::INFINITY
|
|
12
11
|
}
|
|
13
12
|
|
|
14
13
|
# @param [String] url base url for crawler
|
|
15
|
-
# @param [Hash] options hash
|
|
16
|
-
#
|
|
17
|
-
# @example Create crawler with custom User-agent
|
|
14
|
+
# @param [Hash] options hash
|
|
15
|
+
# @example Create crawler with custom User-Agent
|
|
18
16
|
# Crawler.new('example.com', user_agent: 'MyUserAgent')
|
|
19
|
-
# @example Create crawler and resolve all urls
|
|
20
|
-
# Crawler.new('example.com', resolve: true)
|
|
21
17
|
# @example Create crawler and sleep 1 second between each request
|
|
22
18
|
# Crawler.new('example.com', sleep_length: 1)
|
|
23
19
|
# @example Create crawler and perform max 3 requests
|
|
@@ -75,7 +71,7 @@ module SiteMapper
|
|
|
75
71
|
@processed << current_url
|
|
76
72
|
link_elements.each do |page_link|
|
|
77
73
|
url = @crawl_url.absolute_url_from(page_link.attr('href'), current_url)
|
|
78
|
-
@fetch_queue << url if url && eligible_for_queue?(
|
|
74
|
+
@fetch_queue << url if url && eligible_for_queue?(url)
|
|
79
75
|
end
|
|
80
76
|
end
|
|
81
77
|
|
|
@@ -85,22 +81,19 @@ module SiteMapper
|
|
|
85
81
|
|
|
86
82
|
def robots
|
|
87
83
|
return @robots unless @robots.nil?
|
|
88
|
-
|
|
89
|
-
|
|
84
|
+
robots_url = URI.join(@base_url, '/robots.txt').to_s
|
|
85
|
+
robots_body = Request.response_body(robots_url, user_agent: @options[:user_agent])
|
|
86
|
+
@robots = Robots.new(robots_body, URI.parse(@base_url).host, @options[:user_agent])
|
|
90
87
|
@robots
|
|
91
88
|
end
|
|
92
89
|
|
|
93
|
-
def resolve(url)
|
|
94
|
-
@options[:resolve] ? Request.resolve_url(url) : url
|
|
95
|
-
end
|
|
96
|
-
|
|
97
90
|
def wait
|
|
98
91
|
sleep @options[:sleep_length]
|
|
99
92
|
end
|
|
100
93
|
|
|
101
94
|
# Queue of urls to be crawled.
|
|
102
95
|
class CrawlQueue
|
|
103
|
-
# @return [Set] that
|
|
96
|
+
# @return [Set] that extends EnumerablePop module
|
|
104
97
|
def self.new
|
|
105
98
|
Set.new.extend(EnumerablePop)
|
|
106
99
|
end
|
data/lib/site_mapper/logger.rb
CHANGED
|
@@ -45,13 +45,13 @@ module SiteMapper
|
|
|
45
45
|
# Log to STDOUT
|
|
46
46
|
# @param [String] msg to be logged to STDOUT
|
|
47
47
|
def self.log(msg)
|
|
48
|
-
|
|
48
|
+
$stdout.puts(msg)
|
|
49
49
|
end
|
|
50
50
|
|
|
51
51
|
# Log to STDERR
|
|
52
52
|
# @param [String] msg to be logged to STDERR
|
|
53
53
|
def self.err_log(msg)
|
|
54
|
-
|
|
54
|
+
$stderr.puts("[ERROR] #{msg}")
|
|
55
55
|
end
|
|
56
56
|
end
|
|
57
57
|
|
data/lib/site_mapper/version.rb
CHANGED