site_mapper 0.0.5 → 0.0.6

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 2bab1dcdbee3e862d0b7a5ac04e5e6995742ec37
4
- data.tar.gz: df5e1bb8503c30d32f446dc9840412171eba4eb5
3
+ metadata.gz: 6870976b0c732915cd3077c0be8bbfd6f92d7f45
4
+ data.tar.gz: 1f2949d91774c87dd2f156b562ffb9e532f924fe
5
5
  SHA512:
6
- metadata.gz: 20c921483737e916b0009598124c03bc29f5ba919fa76b718ac6bf84550903eba542aee2e37eda3a6411cef2c0da7982809235454d7d0b7447c09c6fad5816d9
7
- data.tar.gz: 81e0f10b5e39c355b0a3be35742aec7c1cda6d1a6adf3de6113d41ef1706cdcaa0a87eec87eb473fc7cb3d8b5e2ce3abfbf70521205ed01f4c7c39c01f0b3c95
6
+ metadata.gz: d736d6c6d62c55e9f38417bb9061dfb19de907ef00d2e41cb31bd1d190d92fd16b319d02483673c333cfcc6cf6feed720906b2ba914957f13f602fc595ef2223
7
+ data.tar.gz: 69661165ac90ede2611f2d24ba43c58c1eed7b17c5405b23d00df0de023e4896210f75ab152e8d2311832ee4aa828d17c59887e0e943e00b8a0daa34d2d3b1ad
@@ -4,7 +4,7 @@ module SiteMapper
4
4
  attr_reader :resolved_base_url, :base_hostname
5
5
 
6
6
  def initialize(base_url)
7
- @resolved_base_url = Request.resolve_url(base_url)
7
+ @resolved_base_url = Request.resolve_url(base_url, with_query: false)
8
8
  @base_hostname = URI.parse(@resolved_base_url).hostname
9
9
  @resolved_base_url.prepend('http://') unless @resolved_base_url.start_with?('http')
10
10
  end
@@ -12,9 +12,8 @@ module SiteMapper
12
12
  base_url = Request.resolve_url(url)
13
13
  @options = { resolve: resolve }
14
14
  @crawl_url = CrawlUrl.new(base_url)
15
- @fetch_queue = Set.new
15
+ @fetch_queue = CrawlQueue.new
16
16
  @processed = Set.new
17
- @fetch_queue << @crawl_url.resolved_base_url
18
17
  end
19
18
 
20
19
  # @see #collect_urls
@@ -27,15 +26,15 @@ module SiteMapper
27
26
  # @example URLs for example.com
28
27
  # crawler = Crawler.new('example.com')
29
28
  # crawler.collect_urls
30
- # @example URLs for example.com with block
29
+ # @example URLs for example.com with block (executes in its own thread)
31
30
  # crawler = Crawler.new('example.com')
32
31
  # crawler.collect_urls do |new_url|
33
32
  # puts "New URL found: #{new_url}"
34
33
  # end
35
34
  def collect_urls
35
+ @fetch_queue << @crawl_url.resolved_base_url
36
36
  until @fetch_queue.empty?
37
- url = @fetch_queue.first
38
- @fetch_queue.delete(@fetch_queue.first)
37
+ url = @fetch_queue.pop
39
38
  Thread.new { yield(url) if block_given? }
40
39
  page_links(url)
41
40
  end
@@ -65,4 +64,18 @@ module SiteMapper
65
64
  @options[:resolve] ? Request.resolve_url(url) : url
66
65
  end
67
66
  end
67
+
68
+ class CrawlQueue
69
+ def self.new
70
+ Set.new.extend(EnumerablePop)
71
+ end
72
+
73
+ module EnumerablePop
74
+ def pop
75
+ first_element = first
76
+ delete(first_element)
77
+ first_element
78
+ end
79
+ end
80
+ end
68
81
  end
@@ -21,8 +21,15 @@ module SiteMapper
21
21
  http.request(request)
22
22
  end
23
23
 
24
- def resolve_url(url)
25
- UrlResolver.resolve(url)
24
+ def resolve_url(url, with_query: true)
25
+ resolved = UrlResolver.resolve(url)
26
+ resolved = remove_query(resolved) unless with_query
27
+ resolved
28
+ end
29
+
30
+ def remove_query(url)
31
+ index = url.index('?')
32
+ index.nil? ? url : url[0...index]
26
33
  end
27
34
  end
28
35
  end
@@ -1,3 +1,3 @@
1
1
  module SiteMapper
2
- VERSION = '0.0.5'
2
+ VERSION = '0.0.6'
3
3
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: site_mapper
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.5
4
+ version: 0.0.6
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jacob Burenstam