coolCrawler 0.3.0 → 0.4.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/coolCrawler/version.rb +1 -1
- data/lib/cool_crawler.rb +21 -6
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 035e8dbe27d4648d16243fea96c692b4a9bde8973d13df02b66aad3aa742afc3
|
4
|
+
data.tar.gz: 687985a3d0e391aedc610ecae0329299a36c8edc90704b9f9d92118532d3d7cc
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 224908b5e8f495063ddc81c178f10e276b2744c8d8b71b16538b3ac2eed70978e686cecc1cbf900d3380a1919ec5e438dd0a1dc559b7d6abbde30e9bd44f819e
|
7
|
+
data.tar.gz: 0ee4aa741315bdd8a2774ee28b762a521f12489356621efe6cf38fab66898a2210313b7e8f07eb74ebd95492b20b80e6548d84a0333f10ac8c09ef6e96410b53
|
data/lib/coolCrawler/version.rb
CHANGED
data/lib/cool_crawler.rb
CHANGED
@@ -13,11 +13,13 @@ module CoolCrawler
|
|
13
13
|
# This is the class that handles the queue and async requests
|
14
14
|
class CrawlerPool
|
15
15
|
|
16
|
-
def initialize(start, max_connections, delay)
|
16
|
+
def initialize(start, max_connections, delay, max_pages=50)
|
17
17
|
@uri = URI(start)
|
18
|
+
@max_pages = max_pages
|
18
19
|
@site = "#{uri.scheme}://#{uri.host}"
|
19
20
|
@max_connections = max_connections
|
20
21
|
@delay = delay
|
22
|
+
@visited_pages = 0
|
21
23
|
visited[uri.path] = 1
|
22
24
|
queue << uri.path
|
23
25
|
end
|
@@ -35,8 +37,8 @@ module CoolCrawler
|
|
35
37
|
end
|
36
38
|
end
|
37
39
|
|
38
|
-
def after(page, links)
|
39
|
-
callback.call(page, links) unless callback.nil?
|
40
|
+
def after(page, links, body)
|
41
|
+
callback.call(page, links, body) unless callback.nil?
|
40
42
|
end
|
41
43
|
|
42
44
|
def send_crawlers
|
@@ -49,10 +51,12 @@ module CoolCrawler
|
|
49
51
|
pages.each do |page|
|
50
52
|
barrier.async do
|
51
53
|
response = internet.get URI.join(@site, page).to_s
|
52
|
-
|
53
|
-
|
54
|
+
body = response.read
|
55
|
+
links = gather_links_uri(body, URI.join(uri, page))
|
56
|
+
after(page, links, body)
|
54
57
|
links.each do |link|
|
55
58
|
enqueue(link)
|
59
|
+
@visited_pages += 1
|
56
60
|
add_to_visited(link)
|
57
61
|
end
|
58
62
|
end
|
@@ -103,7 +107,18 @@ module CoolCrawler
|
|
103
107
|
end
|
104
108
|
|
105
109
|
def enqueue(path)
|
106
|
-
|
110
|
+
unless visited.include?(path) or @visited_pages > @max_pages
|
111
|
+
queue << path
|
112
|
+
p queue.size
|
113
|
+
end
|
114
|
+
end
|
115
|
+
|
116
|
+
def sum_pages
|
117
|
+
sum = 0
|
118
|
+
visited.each do |_k, v|
|
119
|
+
sum += v
|
120
|
+
end
|
121
|
+
sum
|
107
122
|
end
|
108
123
|
end
|
109
124
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: coolCrawler
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.0
|
4
|
+
version: 0.4.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- William Wright
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2022-10-
|
11
|
+
date: 2022-10-31 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rspec
|