site_mapper 0.0.5 → 0.0.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 2bab1dcdbee3e862d0b7a5ac04e5e6995742ec37
4
- data.tar.gz: df5e1bb8503c30d32f446dc9840412171eba4eb5
3
+ metadata.gz: 6870976b0c732915cd3077c0be8bbfd6f92d7f45
4
+ data.tar.gz: 1f2949d91774c87dd2f156b562ffb9e532f924fe
5
5
  SHA512:
6
- metadata.gz: 20c921483737e916b0009598124c03bc29f5ba919fa76b718ac6bf84550903eba542aee2e37eda3a6411cef2c0da7982809235454d7d0b7447c09c6fad5816d9
7
- data.tar.gz: 81e0f10b5e39c355b0a3be35742aec7c1cda6d1a6adf3de6113d41ef1706cdcaa0a87eec87eb473fc7cb3d8b5e2ce3abfbf70521205ed01f4c7c39c01f0b3c95
6
+ metadata.gz: d736d6c6d62c55e9f38417bb9061dfb19de907ef00d2e41cb31bd1d190d92fd16b319d02483673c333cfcc6cf6feed720906b2ba914957f13f602fc595ef2223
7
+ data.tar.gz: 69661165ac90ede2611f2d24ba43c58c1eed7b17c5405b23d00df0de023e4896210f75ab152e8d2311832ee4aa828d17c59887e0e943e00b8a0daa34d2d3b1ad
@@ -4,7 +4,7 @@ module SiteMapper
4
4
  attr_reader :resolved_base_url, :base_hostname
5
5
 
6
6
  def initialize(base_url)
7
- @resolved_base_url = Request.resolve_url(base_url)
7
+ @resolved_base_url = Request.resolve_url(base_url, with_query: false)
8
8
  @base_hostname = URI.parse(@resolved_base_url).hostname
9
9
  @resolved_base_url.prepend('http://') unless @resolved_base_url.start_with?('http')
10
10
  end
@@ -12,9 +12,8 @@ module SiteMapper
12
12
  base_url = Request.resolve_url(url)
13
13
  @options = { resolve: resolve }
14
14
  @crawl_url = CrawlUrl.new(base_url)
15
- @fetch_queue = Set.new
15
+ @fetch_queue = CrawlQueue.new
16
16
  @processed = Set.new
17
- @fetch_queue << @crawl_url.resolved_base_url
18
17
  end
19
18
 
20
19
  # @see #collect_urls
@@ -27,15 +26,15 @@ module SiteMapper
27
26
  # @example URLs for example.com
28
27
  # crawler = Crawler.new('example.com')
29
28
  # crawler.collect_urls
30
- # @example URLs for example.com with block
29
+ # @example URLs for example.com with block (executes in its own thread)
31
30
  # crawler = Crawler.new('example.com')
32
31
  # crawler.collect_urls do |new_url|
33
32
  # puts "New URL found: #{new_url}"
34
33
  # end
35
34
  def collect_urls
35
+ @fetch_queue << @crawl_url.resolved_base_url
36
36
  until @fetch_queue.empty?
37
- url = @fetch_queue.first
38
- @fetch_queue.delete(@fetch_queue.first)
37
+ url = @fetch_queue.pop
39
38
  Thread.new { yield(url) if block_given? }
40
39
  page_links(url)
41
40
  end
@@ -65,4 +64,18 @@ module SiteMapper
65
64
  @options[:resolve] ? Request.resolve_url(url) : url
66
65
  end
67
66
  end
67
+
68
+ class CrawlQueue
69
+ def self.new
70
+ Set.new.extend(EnumerablePop)
71
+ end
72
+
73
+ module EnumerablePop
74
+ def pop
75
+ first_element = first
76
+ delete(first_element)
77
+ first_element
78
+ end
79
+ end
80
+ end
68
81
  end
@@ -21,8 +21,15 @@ module SiteMapper
21
21
  http.request(request)
22
22
  end
23
23
 
24
- def resolve_url(url)
25
- UrlResolver.resolve(url)
24
+ def resolve_url(url, with_query: true)
25
+ resolved = UrlResolver.resolve(url)
26
+ resolved = remove_query(resolved) unless with_query
27
+ resolved
28
+ end
29
+
30
+ def remove_query(url)
31
+ index = url.index('?')
32
+ index.nil? ? url : url[0...index]
26
33
  end
27
34
  end
28
35
  end
@@ -1,3 +1,3 @@
1
1
  module SiteMapper
2
- VERSION = '0.0.5'
2
+ VERSION = '0.0.6'
3
3
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: site_mapper
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.5
4
+ version: 0.0.6
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jacob Burenstam