coolCrawler 0.3.0 → 0.4.2

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: e927853d47dec36f11557336b9a7c1f2d986b1a9933b78878f2cef18f585cf54
4
- data.tar.gz: 67a189af1c21a32650b4cd58573aac781cf2f2a43ce6b20a88eb04a3298fbb79
3
+ metadata.gz: 035e8dbe27d4648d16243fea96c692b4a9bde8973d13df02b66aad3aa742afc3
4
+ data.tar.gz: 687985a3d0e391aedc610ecae0329299a36c8edc90704b9f9d92118532d3d7cc
5
5
  SHA512:
6
- metadata.gz: 6f3e9b7ff0b17807160670456b7d1bf49079760accef770620e0689e66234f6737dd90c973178f1bc600d22c2bc114e95d198fd756a656f006c2d9f35e2b7167
7
- data.tar.gz: 2e03b27c2142c7eb389df2e4955070924c843e92d2a167e6aa1b5411e2698bcfbc551999542575b46eda561a949eaca2a3445a1e7db2ba6304700b58ed8a35b7
6
+ metadata.gz: 224908b5e8f495063ddc81c178f10e276b2744c8d8b71b16538b3ac2eed70978e686cecc1cbf900d3380a1919ec5e438dd0a1dc559b7d6abbde30e9bd44f819e
7
+ data.tar.gz: 0ee4aa741315bdd8a2774ee28b762a521f12489356621efe6cf38fab66898a2210313b7e8f07eb74ebd95492b20b80e6548d84a0333f10ac8c09ef6e96410b53
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module CoolCrawler
4
- VERSION = "0.3.0"
4
+ VERSION = "0.4.2"
5
5
  end
data/lib/cool_crawler.rb CHANGED
@@ -13,11 +13,13 @@ module CoolCrawler
13
13
  # This is the class that handles the queue and async requests
14
14
  class CrawlerPool
15
15
 
16
- def initialize(start, max_connections, delay)
16
+ def initialize(start, max_connections, delay, max_pages=50)
17
17
  @uri = URI(start)
18
+ @max_pages = max_pages
18
19
  @site = "#{uri.scheme}://#{uri.host}"
19
20
  @max_connections = max_connections
20
21
  @delay = delay
22
+ @visited_pages = 0
21
23
  visited[uri.path] = 1
22
24
  queue << uri.path
23
25
  end
@@ -35,8 +37,8 @@ module CoolCrawler
35
37
  end
36
38
  end
37
39
 
38
- def after(page, links)
39
- callback.call(page, links) unless callback.nil?
40
+ def after(page, links, body)
41
+ callback.call(page, links, body) unless callback.nil?
40
42
  end
41
43
 
42
44
  def send_crawlers
@@ -49,10 +51,12 @@ module CoolCrawler
49
51
  pages.each do |page|
50
52
  barrier.async do
51
53
  response = internet.get URI.join(@site, page).to_s
52
- links = gather_links_uri(response.read, URI.join(uri, page))
53
- after(page, links)
54
+ body = response.read
55
+ links = gather_links_uri(body, URI.join(uri, page))
56
+ after(page, links, body)
54
57
  links.each do |link|
55
58
  enqueue(link)
59
+ @visited_pages += 1
56
60
  add_to_visited(link)
57
61
  end
58
62
  end
@@ -103,7 +107,18 @@ module CoolCrawler
103
107
  end
104
108
 
105
109
  def enqueue(path)
106
- queue << path unless visited.include?(path)
110
+ unless visited.include?(path) or @visited_pages > @max_pages
111
+ queue << path
112
+ p queue.size
113
+ end
114
+ end
115
+
116
+ def sum_pages
117
+ sum = 0
118
+ visited.each do |_k, v|
119
+ sum += v
120
+ end
121
+ sum
107
122
  end
108
123
  end
109
124
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: coolCrawler
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.0
4
+ version: 0.4.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - William Wright
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2022-10-05 00:00:00.000000000 Z
11
+ date: 2022-10-31 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rspec