coolCrawler 0.4.3 → 0.4.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/coolCrawler/version.rb +1 -1
- data/lib/cool_crawler.rb +6 -6
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 79059c4a0ef1c026082b3ef2b25ee13c2310984a2049be4345f0345b226d2e99
|
4
|
+
data.tar.gz: 32d02d444ef3553df02c2764073b1d9f5cbdabea272ee1d2b3eb9e80a5a29434
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 56d89149672219eb082eb188d2d79fde2d97931efe8295983719a5693c5b96c07e607a309d03690e8a739a2a3018f73c7d0fbd18da6003cbafa7969633890b2e
|
7
|
+
data.tar.gz: f37a6061b4b318cb19423ab4de961a98b625d1e17e6f85eb78524794a967a2f5dda4bceaa5c5414b2df33a359c9c0fd5a8e8a7bfe22c4f872e41a9e9c5f46d48
|
data/lib/coolCrawler/version.rb
CHANGED
data/lib/cool_crawler.rb
CHANGED
@@ -31,7 +31,7 @@ module CoolCrawler
|
|
31
31
|
end
|
32
32
|
|
33
33
|
def run
|
34
|
-
until queue.empty?
|
34
|
+
until queue.empty? || @visited_pages >= @max_pages
|
35
35
|
send_crawlers
|
36
36
|
sleep(delay)
|
37
37
|
end
|
@@ -43,7 +43,10 @@ module CoolCrawler
|
|
43
43
|
|
44
44
|
def send_crawlers
|
45
45
|
pages = []
|
46
|
-
|
46
|
+
until queue.empty? || pages.size >= max_connections || @visited_pages >= @max_pages
|
47
|
+
pages << queue.pop
|
48
|
+
@visited_pages += 1
|
49
|
+
end
|
47
50
|
Async do
|
48
51
|
internet = Async::HTTP::Internet.new
|
49
52
|
barrier = Async::Barrier.new
|
@@ -76,9 +79,7 @@ module CoolCrawler
|
|
76
79
|
next if a["href"].nil?
|
77
80
|
uri_a = URI(a["href"].strip.split('#')[0].sub(/\\|(\s+$)/, ""))
|
78
81
|
begin
|
79
|
-
|
80
|
-
links << URI.join(page, uri_a).path if (uri_a.host == uri.host || uri_a.host.nil?) && uri_a.path
|
81
|
-
end
|
82
|
+
links << URI.join(page, uri_a).path if (uri_a.host == uri.host || uri_a.host.nil?) && uri_a.path
|
82
83
|
rescue
|
83
84
|
# do nothing
|
84
85
|
end
|
@@ -104,7 +105,6 @@ module CoolCrawler
|
|
104
105
|
else
|
105
106
|
visited[path] = 1
|
106
107
|
end
|
107
|
-
@visited_pages += 1
|
108
108
|
end
|
109
109
|
|
110
110
|
def sorted_visited
|