site_mapper 0.0.3 → 0.0.4

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 358ce69fd3d2bf6f019b176cf5d3a86c381fd4c4
4
- data.tar.gz: 710c2f869c310189d9f91cac438868797dfba59b
3
+ metadata.gz: b8ce68b3debd6d26847da79293a2c84411423cf7
4
+ data.tar.gz: 471bea81d090f201590b1d56ab99e7d7de09e21c
5
5
  SHA512:
6
- metadata.gz: c592d4f952f005346f45ddd99686c9882d362612b6962ead35c83962d41ef9bb0d3e64565c74d905ad7abf336c896131633ba51574b16b5816c9d419ea2bc5c7
7
- data.tar.gz: 0c98a7f7d69148b9a9e41ab89e80c9fd4c737df87e48f15424234576c7cc676671098cc512953a401bbf3bf2112875e7dedcb02b5f32ad589a8b7217206bc9c0
6
+ metadata.gz: 158d46c3b674c7854eab08413bfb2b58656712d39bb29ca714d6c2997e8c826508542ac13e6a20286b6066b30ac51d223343314cfb77e7d072c9b51da5fa01d9
7
+ data.tar.gz: d7467e386d748bc41ef3f72180e36a321fa0d0d47a9e92a6e76d5aa202c9f2f38c784ff8f896359e04f5fd9f6aa55140da526e03d1e8d147e0dc73d5308c3f13
@@ -1,4 +1,5 @@
1
1
  module SiteMapper
2
+ # Crawl URL formatter
2
3
  class CrawlUrl
3
4
  attr_reader :resolved_base_url, :base_hostname
4
5
 
@@ -8,6 +9,14 @@ module SiteMapper
8
9
  @resolved_base_url.prepend('http://') unless @resolved_base_url.start_with?('http')
9
10
  end
10
11
 
12
+ # Given a link it constructs the absolute path,
13
+ # if valid URL & URL has same domain as @resolved_base_url.
14
+ # @return [String, nil] absolute URL of the resource, or nil if raw_url is not eligible.
15
+ # @param [String] raw_url href from a link element (get_url is the URL of the current page)
16
+ # @example Construct absolute URL for '/path', example.com
17
+ # cu = CrawlUrl.new('example.com')
18
+ # cu.absolute_url_from('/path', 'example.com/some/path')
19
+ # # => http://example.com/some/path
11
20
  def absolute_url_from(raw_url, get_url)
12
21
  return nil unless eligible_url?(raw_url)
13
22
  parsed_url = URI.parse(raw_url) rescue URI.parse('')
@@ -17,14 +17,26 @@ module SiteMapper
17
17
  @fetch_queue << @crawl_url.resolved_base_url
18
18
  end
19
19
 
20
+ # @see #collect_urls
20
21
  def self.collect_urls(base_url)
21
- new(base_url).collect_urls
22
+ new(base_url).collect_urls { |url| yield(url) }
22
23
  end
23
24
 
25
+ # Collects all links found on the domain that point to the same domain.
26
+ # @return [Array] with links.
27
+ # @example URLs for example.com
28
+ # crawler = Crawler.new('example.com')
29
+ # crawler.collect_urls
30
+ # @example URLs for example.com with block
31
+ # crawler = Crawler.new('example.com')
32
+ # crawler.collect_urls do |new_url|
33
+ # puts "New URL found: #{new_url}"
34
+ # end
24
35
  def collect_urls
25
36
  until @fetch_queue.empty?
26
37
  url = @fetch_queue.first
27
38
  @fetch_queue.delete(@fetch_queue.first)
39
+ Thread.new { yield(url) if block_given? }
28
40
  page_links(url)
29
41
  end
30
42
  puts "Crawling finished, #{@processed.length} links found"
@@ -43,8 +55,8 @@ module SiteMapper
43
55
  link_elements.each do |page_link|
44
56
  absolute_url = @crawl_url.absolute_url_from(page_link.attr('href'), get_url)
45
57
  if absolute_url
46
- resolved_url = resolve(absolute_url)
47
- @fetch_queue << resolved_url if !@processed.include?(resolved_url)
58
+ url = resolve(absolute_url)
59
+ @fetch_queue << url unless @processed.include?(url)
48
60
  end
49
61
  end
50
62
  end
@@ -1,3 +1,3 @@
1
1
  module SiteMapper
2
- VERSION = '0.0.3'
2
+ VERSION = '0.0.4'
3
3
  end
data/lib/site_mapper.rb CHANGED
@@ -6,7 +6,13 @@ require 'site_mapper/request'
6
6
  require 'site_mapper/crawler'
7
7
  require 'site_mapper/crawl_url'
8
8
 
9
+ # Finds all links on a domain that point to that same domain.
9
10
  module SiteMapper
11
+ # Returns all links found on the domain that point to the same domain.
12
+ # @return [Array] with links.
13
+ # @param [String] source URL for domain
14
+ # @example Collect all URLs from example.com
15
+ # SiteMapper.map('example.com')
10
16
  def self.map(source)
11
17
  Crawler.collect_urls(source)
12
18
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: site_mapper
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.3
4
+ version: 0.0.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jacob Burenstam