coolCrawler 0.3.0 → 0.4.2

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: e927853d47dec36f11557336b9a7c1f2d986b1a9933b78878f2cef18f585cf54
4
- data.tar.gz: 67a189af1c21a32650b4cd58573aac781cf2f2a43ce6b20a88eb04a3298fbb79
3
+ metadata.gz: 035e8dbe27d4648d16243fea96c692b4a9bde8973d13df02b66aad3aa742afc3
4
+ data.tar.gz: 687985a3d0e391aedc610ecae0329299a36c8edc90704b9f9d92118532d3d7cc
5
5
  SHA512:
6
- metadata.gz: 6f3e9b7ff0b17807160670456b7d1bf49079760accef770620e0689e66234f6737dd90c973178f1bc600d22c2bc114e95d198fd756a656f006c2d9f35e2b7167
7
- data.tar.gz: 2e03b27c2142c7eb389df2e4955070924c843e92d2a167e6aa1b5411e2698bcfbc551999542575b46eda561a949eaca2a3445a1e7db2ba6304700b58ed8a35b7
6
+ metadata.gz: 224908b5e8f495063ddc81c178f10e276b2744c8d8b71b16538b3ac2eed70978e686cecc1cbf900d3380a1919ec5e438dd0a1dc559b7d6abbde30e9bd44f819e
7
+ data.tar.gz: 0ee4aa741315bdd8a2774ee28b762a521f12489356621efe6cf38fab66898a2210313b7e8f07eb74ebd95492b20b80e6548d84a0333f10ac8c09ef6e96410b53
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module CoolCrawler
4
- VERSION = "0.3.0"
4
+ VERSION = "0.4.2"
5
5
  end
data/lib/cool_crawler.rb CHANGED
@@ -13,11 +13,13 @@ module CoolCrawler
13
13
  # This is the class that handles the queue and async requests
14
14
  class CrawlerPool
15
15
 
16
- def initialize(start, max_connections, delay)
16
+ def initialize(start, max_connections, delay, max_pages=50)
17
17
  @uri = URI(start)
18
+ @max_pages = max_pages
18
19
  @site = "#{uri.scheme}://#{uri.host}"
19
20
  @max_connections = max_connections
20
21
  @delay = delay
22
+ @visited_pages = 0
21
23
  visited[uri.path] = 1
22
24
  queue << uri.path
23
25
  end
@@ -35,8 +37,8 @@ module CoolCrawler
35
37
  end
36
38
  end
37
39
 
38
- def after(page, links)
39
- callback.call(page, links) unless callback.nil?
40
+ def after(page, links, body)
41
+ callback.call(page, links, body) unless callback.nil?
40
42
  end
41
43
 
42
44
  def send_crawlers
@@ -49,10 +51,12 @@ module CoolCrawler
49
51
  pages.each do |page|
50
52
  barrier.async do
51
53
  response = internet.get URI.join(@site, page).to_s
52
- links = gather_links_uri(response.read, URI.join(uri, page))
53
- after(page, links)
54
+ body = response.read
55
+ links = gather_links_uri(body, URI.join(uri, page))
56
+ after(page, links, body)
54
57
  links.each do |link|
55
58
  enqueue(link)
59
+ @visited_pages += 1
56
60
  add_to_visited(link)
57
61
  end
58
62
  end
@@ -103,7 +107,18 @@ module CoolCrawler
103
107
  end
104
108
 
105
109
  def enqueue(path)
106
- queue << path unless visited.include?(path)
110
+ unless visited.include?(path) or @visited_pages > @max_pages
111
+ queue << path
112
+ p queue.size
113
+ end
114
+ end
115
+
116
+ def sum_pages
117
+ sum = 0
118
+ visited.each do |_k, v|
119
+ sum += v
120
+ end
121
+ sum
107
122
  end
108
123
  end
109
124
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: coolCrawler
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.0
4
+ version: 0.4.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - William Wright
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2022-10-05 00:00:00.000000000 Z
11
+ date: 2022-10-31 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rspec