site_mapper 0.0.3 → 0.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 358ce69fd3d2bf6f019b176cf5d3a86c381fd4c4
4
- data.tar.gz: 710c2f869c310189d9f91cac438868797dfba59b
3
+ metadata.gz: b8ce68b3debd6d26847da79293a2c84411423cf7
4
+ data.tar.gz: 471bea81d090f201590b1d56ab99e7d7de09e21c
5
5
  SHA512:
6
- metadata.gz: c592d4f952f005346f45ddd99686c9882d362612b6962ead35c83962d41ef9bb0d3e64565c74d905ad7abf336c896131633ba51574b16b5816c9d419ea2bc5c7
7
- data.tar.gz: 0c98a7f7d69148b9a9e41ab89e80c9fd4c737df87e48f15424234576c7cc676671098cc512953a401bbf3bf2112875e7dedcb02b5f32ad589a8b7217206bc9c0
6
+ metadata.gz: 158d46c3b674c7854eab08413bfb2b58656712d39bb29ca714d6c2997e8c826508542ac13e6a20286b6066b30ac51d223343314cfb77e7d072c9b51da5fa01d9
7
+ data.tar.gz: d7467e386d748bc41ef3f72180e36a321fa0d0d47a9e92a6e76d5aa202c9f2f38c784ff8f896359e04f5fd9f6aa55140da526e03d1e8d147e0dc73d5308c3f13
@@ -1,4 +1,5 @@
1
1
  module SiteMapper
2
+ # Crawl URL formatter
2
3
  class CrawlUrl
3
4
  attr_reader :resolved_base_url, :base_hostname
4
5
 
@@ -8,6 +9,14 @@ module SiteMapper
8
9
  @resolved_base_url.prepend('http://') unless @resolved_base_url.start_with?('http')
9
10
  end
10
11
 
12
+ # Given a link, constructs its absolute URL,
13
+ # provided the link is a valid URL on the same domain as @resolved_base_url.
14
+ # @return [String] with absolute path to resource.
15
+ # @param [String, String] raw_url from link element and current page URL
16
+ # @example Construct absolute URL for '/path', example.com
17
+ # cu = CrawlUrl.new('example.com')
18
+ # cu.absolute_url_from('/path', 'example.com/some/path')
19
+ # # => http://example.com/some/path
11
20
  def absolute_url_from(raw_url, get_url)
12
21
  return nil unless eligible_url?(raw_url)
13
22
  parsed_url = URI.parse(raw_url) rescue URI.parse('')
@@ -17,14 +17,26 @@ module SiteMapper
17
17
  @fetch_queue << @crawl_url.resolved_base_url
18
18
  end
19
19
 
20
+ # @see #collect_urls
20
21
  def self.collect_urls(base_url)
21
- new(base_url).collect_urls
22
+ new(base_url).collect_urls { |url| yield(url) }
22
23
  end
23
24
 
25
+ # Collects all links found on the domain that point to that same domain.
26
+ # @return [Array] with links.
27
+ # @example URLs for example.com
28
+ # crawler = Crawler.new('example.com')
29
+ # crawler.collect_urls
30
+ # @example URLs for example.com with block
31
+ # crawler = Crawler.new('example.com')
32
+ # crawler.collect_urls do |new_url|
33
+ # puts "New URL found: #{new_url}"
34
+ # end
24
35
  def collect_urls
25
36
  until @fetch_queue.empty?
26
37
  url = @fetch_queue.first
27
38
  @fetch_queue.delete(@fetch_queue.first)
39
+ Thread.new { yield(url) if block_given? }
28
40
  page_links(url)
29
41
  end
30
42
  puts "Crawling finished, #{@processed.length} links found"
@@ -43,8 +55,8 @@ module SiteMapper
43
55
  link_elements.each do |page_link|
44
56
  absolute_url = @crawl_url.absolute_url_from(page_link.attr('href'), get_url)
45
57
  if absolute_url
46
- resolved_url = resolve(absolute_url)
47
- @fetch_queue << resolved_url if !@processed.include?(resolved_url)
58
+ url = resolve(absolute_url)
59
+ @fetch_queue << url unless @processed.include?(url)
48
60
  end
49
61
  end
50
62
  end
@@ -1,3 +1,3 @@
1
1
  module SiteMapper
2
- VERSION = '0.0.3'
2
+ VERSION = '0.0.4'
3
3
  end
data/lib/site_mapper.rb CHANGED
@@ -6,7 +6,13 @@ require 'site_mapper/request'
6
6
  require 'site_mapper/crawler'
7
7
  require 'site_mapper/crawl_url'
8
8
 
9
+ # Finds all links on a domain that point to that same domain.
9
10
  module SiteMapper
11
+ # Returns all links found on the domain that point to that same domain.
12
+ # @return [Array] with links.
13
+ # @param [String] URL for domain
14
+ # @example Collect all URLs from example.com
15
+ # SiteMapper.map('example.com')
10
16
  def self.map(source)
11
17
  Crawler.collect_urls(source)
12
18
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: site_mapper
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.3
4
+ version: 0.0.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jacob Burenstam