site_mapper 0.0.5 → 0.0.6
This diff shows the changes between publicly available versions of the package, as released to one of the supported registries. It is provided for informational purposes only and reflects the differences between the package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/site_mapper/crawl_url.rb +1 -1
- data/lib/site_mapper/crawler.rb +18 -5
- data/lib/site_mapper/request.rb +9 -2
- data/lib/site_mapper/version.rb +1 -1
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 6870976b0c732915cd3077c0be8bbfd6f92d7f45
|
4
|
+
data.tar.gz: 1f2949d91774c87dd2f156b562ffb9e532f924fe
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: d736d6c6d62c55e9f38417bb9061dfb19de907ef00d2e41cb31bd1d190d92fd16b319d02483673c333cfcc6cf6feed720906b2ba914957f13f602fc595ef2223
|
7
|
+
data.tar.gz: 69661165ac90ede2611f2d24ba43c58c1eed7b17c5405b23d00df0de023e4896210f75ab152e8d2311832ee4aa828d17c59887e0e943e00b8a0daa34d2d3b1ad
|
data/lib/site_mapper/crawl_url.rb
CHANGED
@@ -4,7 +4,7 @@ module SiteMapper
|
|
4
4
|
attr_reader :resolved_base_url, :base_hostname
|
5
5
|
|
6
6
|
def initialize(base_url)
|
7
|
-
@resolved_base_url = Request.resolve_url(base_url)
|
7
|
+
@resolved_base_url = Request.resolve_url(base_url, with_query: false)
|
8
8
|
@base_hostname = URI.parse(@resolved_base_url).hostname
|
9
9
|
@resolved_base_url.prepend('http://') unless @resolved_base_url.start_with?('http')
|
10
10
|
end
|
data/lib/site_mapper/crawler.rb
CHANGED
@@ -12,9 +12,8 @@ module SiteMapper
|
|
12
12
|
base_url = Request.resolve_url(url)
|
13
13
|
@options = { resolve: resolve }
|
14
14
|
@crawl_url = CrawlUrl.new(base_url)
|
15
|
-
@fetch_queue =
|
15
|
+
@fetch_queue = CrawlQueue.new
|
16
16
|
@processed = Set.new
|
17
|
-
@fetch_queue << @crawl_url.resolved_base_url
|
18
17
|
end
|
19
18
|
|
20
19
|
# @see #collect_urls
|
@@ -27,15 +26,15 @@ module SiteMapper
|
|
27
26
|
# @example URLs for example.com
|
28
27
|
# crawler = Crawler.new('example.com')
|
29
28
|
# crawler.collect_urls
|
30
|
-
# @example URLs for example.com with block
|
29
|
+
# @example URLs for example.com with block (executes in its own thread)
|
31
30
|
# crawler = Crawler.new('example.com')
|
32
31
|
# crawler.collect_urls do |new_url|
|
33
32
|
# puts "New URL found: #{new_url}"
|
34
33
|
# end
|
35
34
|
def collect_urls
|
35
|
+
@fetch_queue << @crawl_url.resolved_base_url
|
36
36
|
until @fetch_queue.empty?
|
37
|
-
url = @fetch_queue.first
|
38
|
-
@fetch_queue.delete(@fetch_queue.first)
|
37
|
+
url = @fetch_queue.pop
|
39
38
|
Thread.new { yield(url) if block_given? }
|
40
39
|
page_links(url)
|
41
40
|
end
|
@@ -65,4 +64,18 @@ module SiteMapper
|
|
65
64
|
@options[:resolve] ? Request.resolve_url(url) : url
|
66
65
|
end
|
67
66
|
end
|
67
|
+
|
68
|
+
class CrawlQueue
|
69
|
+
def self.new
|
70
|
+
Set.new.extend(EnumerablePop)
|
71
|
+
end
|
72
|
+
|
73
|
+
module EnumerablePop
|
74
|
+
def pop
|
75
|
+
first_element = first
|
76
|
+
delete(first_element)
|
77
|
+
first_element
|
78
|
+
end
|
79
|
+
end
|
80
|
+
end
|
68
81
|
end
|
data/lib/site_mapper/request.rb
CHANGED
@@ -21,8 +21,15 @@ module SiteMapper
|
|
21
21
|
http.request(request)
|
22
22
|
end
|
23
23
|
|
24
|
-
def resolve_url(url)
|
25
|
-
UrlResolver.resolve(url)
|
24
|
+
def resolve_url(url, with_query: true)
|
25
|
+
resolved = UrlResolver.resolve(url)
|
26
|
+
resolved = remove_query(resolved) unless with_query
|
27
|
+
resolved
|
28
|
+
end
|
29
|
+
|
30
|
+
def remove_query(url)
|
31
|
+
index = url.index('?')
|
32
|
+
index.nil? ? url : url[0...index]
|
26
33
|
end
|
27
34
|
end
|
28
35
|
end
|
data/lib/site_mapper/version.rb
CHANGED