site_mapper 0.0.5 → 0.0.6
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/site_mapper/crawl_url.rb +1 -1
- data/lib/site_mapper/crawler.rb +18 -5
- data/lib/site_mapper/request.rb +9 -2
- data/lib/site_mapper/version.rb +1 -1
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 6870976b0c732915cd3077c0be8bbfd6f92d7f45
|
4
|
+
data.tar.gz: 1f2949d91774c87dd2f156b562ffb9e532f924fe
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: d736d6c6d62c55e9f38417bb9061dfb19de907ef00d2e41cb31bd1d190d92fd16b319d02483673c333cfcc6cf6feed720906b2ba914957f13f602fc595ef2223
|
7
|
+
data.tar.gz: 69661165ac90ede2611f2d24ba43c58c1eed7b17c5405b23d00df0de023e4896210f75ab152e8d2311832ee4aa828d17c59887e0e943e00b8a0daa34d2d3b1ad
|
data/lib/site_mapper/crawl_url.rb
CHANGED
@@ -4,7 +4,7 @@ module SiteMapper
|
|
4
4
|
attr_reader :resolved_base_url, :base_hostname
|
5
5
|
|
6
6
|
def initialize(base_url)
|
7
|
-
@resolved_base_url = Request.resolve_url(base_url)
|
7
|
+
@resolved_base_url = Request.resolve_url(base_url, with_query: false)
|
8
8
|
@base_hostname = URI.parse(@resolved_base_url).hostname
|
9
9
|
@resolved_base_url.prepend('http://') unless @resolved_base_url.start_with?('http')
|
10
10
|
end
|
data/lib/site_mapper/crawler.rb
CHANGED
@@ -12,9 +12,8 @@ module SiteMapper
|
|
12
12
|
base_url = Request.resolve_url(url)
|
13
13
|
@options = { resolve: resolve }
|
14
14
|
@crawl_url = CrawlUrl.new(base_url)
|
15
|
-
@fetch_queue = Set.new
|
15
|
+
@fetch_queue = CrawlQueue.new
|
16
16
|
@processed = Set.new
|
17
|
-
@fetch_queue << @crawl_url.resolved_base_url
|
18
17
|
end
|
19
18
|
|
20
19
|
# @see #collect_urls
|
@@ -27,15 +26,15 @@ module SiteMapper
|
|
27
26
|
# @example URLs for example.com
|
28
27
|
# crawler = Crawler.new('example.com')
|
29
28
|
# crawler.collect_urls
|
30
|
-
# @example URLs for example.com with block
|
29
|
+
# @example URLs for example.com with block (executes in its own thread)
|
31
30
|
# crawler = Crawler.new('example.com')
|
32
31
|
# crawler.collect_urls do |new_url|
|
33
32
|
# puts "New URL found: #{new_url}"
|
34
33
|
# end
|
35
34
|
def collect_urls
|
35
|
+
@fetch_queue << @crawl_url.resolved_base_url
|
36
36
|
until @fetch_queue.empty?
|
37
|
-
url = @fetch_queue.first
|
38
|
-
@fetch_queue.delete(@fetch_queue.first)
|
37
|
+
url = @fetch_queue.pop
|
39
38
|
Thread.new { yield(url) if block_given? }
|
40
39
|
page_links(url)
|
41
40
|
end
|
@@ -65,4 +64,18 @@ module SiteMapper
|
|
65
64
|
@options[:resolve] ? Request.resolve_url(url) : url
|
66
65
|
end
|
67
66
|
end
|
67
|
+
|
68
|
+
class CrawlQueue
|
69
|
+
def self.new
|
70
|
+
Set.new.extend(EnumerablePop)
|
71
|
+
end
|
72
|
+
|
73
|
+
module EnumerablePop
|
74
|
+
def pop
|
75
|
+
first_element = first
|
76
|
+
delete(first_element)
|
77
|
+
first_element
|
78
|
+
end
|
79
|
+
end
|
80
|
+
end
|
68
81
|
end
|
data/lib/site_mapper/request.rb
CHANGED
@@ -21,8 +21,15 @@ module SiteMapper
|
|
21
21
|
http.request(request)
|
22
22
|
end
|
23
23
|
|
24
|
-
def resolve_url(url)
|
25
|
-
UrlResolver.resolve(url)
|
24
|
+
def resolve_url(url, with_query: true)
|
25
|
+
resolved = UrlResolver.resolve(url)
|
26
|
+
resolved = remove_query(resolved) unless with_query
|
27
|
+
resolved
|
28
|
+
end
|
29
|
+
|
30
|
+
def remove_query(url)
|
31
|
+
index = url.index('?')
|
32
|
+
index.nil? ? url : url[0...index]
|
26
33
|
end
|
27
34
|
end
|
28
35
|
end
|
data/lib/site_mapper/version.rb
CHANGED