site_mapper 0.0.3 → 0.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/site_mapper/crawl_url.rb +9 -0
- data/lib/site_mapper/crawler.rb +15 -3
- data/lib/site_mapper/version.rb +1 -1
- data/lib/site_mapper.rb +6 -0
- metadata +1 -1
checksums.yaml
CHANGED
```diff
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: b8ce68b3debd6d26847da79293a2c84411423cf7
+  data.tar.gz: 471bea81d090f201590b1d56ab99e7d7de09e21c
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 158d46c3b674c7854eab08413bfb2b58656712d39bb29ca714d6c2997e8c826508542ac13e6a20286b6066b30ac51d223343314cfb77e7d072c9b51da5fa01d9
+  data.tar.gz: d7467e386d748bc41ef3f72180e36a321fa0d0d47a9e92a6e76d5aa202c9f2f38c784ff8f896359e04f5fd9f6aa55140da526e03d1e8d147e0dc73d5308c3f13
```
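For context, a `.gem` file is a tar archive whose `metadata.gz` and `data.tar.gz` members are what these checksums cover. A minimal verification sketch, assuming the archive has already been unpacked (e.g. `tar -xf site_mapper-0.0.4.gem`) into the current directory:

```ruby
require 'digest'

# Expected values copied from the new checksums.yaml above.
EXPECTED_SHA1 = {
  'metadata.gz' => 'b8ce68b3debd6d26847da79293a2c84411423cf7',
  'data.tar.gz' => '471bea81d090f201590b1d56ab99e7d7de09e21c'
}.freeze

EXPECTED_SHA1.each do |name, expected|
  actual = Digest::SHA1.file(name).hexdigest
  puts format('%-12s %s', name, actual == expected ? 'OK' : 'MISMATCH')
end
```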
data/lib/site_mapper/crawl_url.rb
CHANGED
```diff
@@ -1,4 +1,5 @@
 module SiteMapper
+  # Crawl URL formatter
   class CrawlUrl
     attr_reader :resolved_base_url, :base_hostname
 
@@ -8,6 +9,14 @@ module SiteMapper
       @resolved_base_url.prepend('http://') unless @resolved_base_url.start_with?('http')
     end
 
+    # Given a link it constructs the absolute path,
+    # if valid URL & URL has same domain as @resolved_base_url.
+    # @return [String] with absolute path to resource.
+    # @param [String, String] raw_url from link element and current page URL
+    # @example Construct absolute URL for '/path', example.com
+    #   cu = CrawlUrl.new('example.com')
+    #   cu.absolute_url_from('/path', 'example.com/some/path')
+    #   # => http://example.com/some/path
     def absolute_url_from(raw_url, get_url)
       return nil unless eligible_url?(raw_url)
       parsed_url = URI.parse(raw_url) rescue URI.parse('')
```
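The new YARD comment on `absolute_url_from` is the substance of this change; its `@example` runs as-is once the gem is installed (expected output taken from the comment above):

```ruby
require 'site_mapper'

cu = SiteMapper::CrawlUrl.new('example.com')
cu.absolute_url_from('/path', 'example.com/some/path')
# => "http://example.com/some/path" (per the @example)
# Links that fail the private eligible_url? check return nil instead.
```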
data/lib/site_mapper/crawler.rb
CHANGED
```diff
@@ -17,14 +17,26 @@ module SiteMapper
       @fetch_queue << @crawl_url.resolved_base_url
     end
 
+    # @see #collect_urls
     def self.collect_urls(base_url)
-      new(base_url).collect_urls
+      new(base_url).collect_urls { |url| yield(url) }
     end
 
+    # Collects all links on domain for domain
+    # @return [Array] with links.
+    # @example URLs for example.com
+    #   crawler = Crawler.new('example.com')
+    #   crawler.collect_urls
+    # @example URLs for example.com with block
+    #   crawler = Crawler.new('example.com')
+    #   crawler.collect_urls do |new_url|
+    #     puts "New URL found: #{new_url}"
+    #   end
     def collect_urls
       until @fetch_queue.empty?
         url = @fetch_queue.first
         @fetch_queue.delete(@fetch_queue.first)
+        Thread.new { yield(url) if block_given? }
         page_links(url)
       end
       puts "Crawling finished, #{@processed.length} links found"
@@ -43,8 +55,8 @@ module SiteMapper
       link_elements.each do |page_link|
         absolute_url = @crawl_url.absolute_url_from(page_link.attr('href'), get_url)
         if absolute_url
-
-          @fetch_queue <<
+          url = resolve(absolute_url)
+          @fetch_queue << url unless @processed.include?(url)
         end
       end
     end
```
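The practical upshot of these two hunks: `collect_urls` now accepts a block and yields each URL as it is pulled off the queue, with every callback dispatched on its own thread, and `page_links` de-duplicates against `@processed` before enqueueing. A sketch of the new call style (the domain is illustrative):

```ruby
require 'site_mapper'

# Each URL is yielded from a fresh Thread, so the block must be
# thread-safe; Queue from the stdlib is.
found = Queue.new
SiteMapper::Crawler.collect_urls('example.com') do |url|
  found << url
end
puts "#{found.size} URLs seen during the crawl"
```

Note that nothing in the diff joins the `Thread.new` callbacks, so a block may still be executing after `collect_urls` returns.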
data/lib/site_mapper/version.rb
CHANGED
data/lib/site_mapper.rb
CHANGED
```diff
@@ -6,7 +6,13 @@ require 'site_mapper/request'
 require 'site_mapper/crawler'
 require 'site_mapper/crawl_url'
 
+# Find all links on domain to domain
 module SiteMapper
+  # Returns all links found on domain to domain.
+  # @return [Array] with links.
+  # @param [String] URL for domain
+  # @example Collect all URLs from example.com
+  #   SiteMapper.map('example.com')
   def self.map(source)
     Crawler.collect_urls(source)
   end
```
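With the module-level docs in place, the public entry point reads end to end: `SiteMapper.map` delegates to `Crawler.collect_urls` and, per its `@return` tag, produces an Array of links (domain illustrative):

```ruby
require 'site_mapper'

urls = SiteMapper.map('example.com')
puts urls
```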