site_mapper 0.0.3 → 0.0.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/site_mapper/crawl_url.rb +9 -0
- data/lib/site_mapper/crawler.rb +15 -3
- data/lib/site_mapper/version.rb +1 -1
- data/lib/site_mapper.rb +6 -0
- metadata +1 -1
checksums.yaml
CHANGED
(NOTE: the old checksum values in the removed lines were lost in this page extraction — only the new 0.0.4 values are recoverable.)

```diff
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: b8ce68b3debd6d26847da79293a2c84411423cf7
+  data.tar.gz: 471bea81d090f201590b1d56ab99e7d7de09e21c
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 158d46c3b674c7854eab08413bfb2b58656712d39bb29ca714d6c2997e8c826508542ac13e6a20286b6066b30ac51d223343314cfb77e7d072c9b51da5fa01d9
+  data.tar.gz: d7467e386d748bc41ef3f72180e36a321fa0d0d47a9e92a6e76d5aa202c9f2f38c784ff8f896359e04f5fd9f6aa55140da526e03d1e8d147e0dc73d5308c3f13
```
|
data/lib/site_mapper/crawl_url.rb
CHANGED
```diff
@@ -1,4 +1,5 @@
 module SiteMapper
+  # Crawl URL formatter
   class CrawlUrl
     attr_reader :resolved_base_url, :base_hostname

@@ -8,6 +9,14 @@ module SiteMapper
       @resolved_base_url.prepend('http://') unless @resolved_base_url.start_with?('http')
     end

+    # Given a link it constructs the absolute path,
+    # if valid URL & URL has same domain as @resolved_base_url.
+    # @return [String] with absolute path to resource.
+    # @param [String, String] raw_url from link element and current page URL
+    # @example Construct absolute URL for '/path', example.com
+    #    cu = CrawlUrl.new('example.com')
+    #    cu.absolute_url_from('/path', 'example.com/some/path')
+    #    # => http://example.com/some/path
     def absolute_url_from(raw_url, get_url)
       return nil unless eligible_url?(raw_url)
       parsed_url = URI.parse(raw_url) rescue URI.parse('')
```
data/lib/site_mapper/crawler.rb
CHANGED
```diff
@@ -17,14 +17,26 @@ module SiteMapper
       @fetch_queue << @crawl_url.resolved_base_url
     end

+    # @see #collect_urls
     def self.collect_urls(base_url)
-      new(base_url).collect_urls
+      new(base_url).collect_urls { |url| yield(url) }
     end

+    # Collects all links on domain for domain
+    # @return [Array] with links.
+    # @example URLs for example.com
+    #    crawler = Crawler.new('example.com')
+    #    crawler.collect_urls
+    # @example URLs for example.com with block
+    #    crawler = Crawler.new('example.com')
+    #    crawler.collect_urls do |new_url|
+    #      puts "New URL found: #{new_url}"
+    #    end
     def collect_urls
       until @fetch_queue.empty?
         url = @fetch_queue.first
         @fetch_queue.delete(@fetch_queue.first)
+        Thread.new { yield(url) if block_given? }
         page_links(url)
       end
       puts "Crawling finished, #{@processed.length} links found"
@@ -43,8 +55,8 @@ module SiteMapper
       link_elements.each do |page_link|
         absolute_url = @crawl_url.absolute_url_from(page_link.attr('href'), get_url)
         if absolute_url
-
-          @fetch_queue <<
+          url = resolve(absolute_url)
+          @fetch_queue << url unless @processed.include?(url)
         end
       end
     end
```
(NOTE: the tail of the two removed lines in the second hunk — old lines 46–47 — appears truncated by the page extraction; the removed code likely continued past `@fetch_queue <<`.)
data/lib/site_mapper/version.rb
CHANGED
data/lib/site_mapper.rb
CHANGED
```diff
@@ -6,7 +6,13 @@ require 'site_mapper/request'
 require 'site_mapper/crawler'
 require 'site_mapper/crawl_url'

+# Find all links on domain to domain
 module SiteMapper
+  # Returns all links found on domain to domain.
+  # @return [Array] with links.
+  # @param [String] URL for domain
+  # @example Collect all URLs from example.com
+  #    SiteMapper.map('example.com')
   def self.map(source)
     Crawler.collect_urls(source)
   end
```