coolCrawler 0.4.0 → 0.4.3

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 7aecaddb63af3eb7a621f2cd75472291e0376ca57ef30ec229edcbe274bc704b
4
- data.tar.gz: f42a0b9230a6886169cc7dd35f25f03dcb8b1f6d53ec5515c32ed647e2249ee5
3
+ metadata.gz: a8c2944180ee7c5d7f1241fd62bf5e89973a2fca742c51567e1550f14f95f70a
4
+ data.tar.gz: 6635f39af2babaead932e94cae419738d7bd61e0ecc2f23216cb6102607904bf
5
5
  SHA512:
6
- metadata.gz: 36442036c64c324adf090c7ccb4b6cc474875471ac1dc54ead21604e9de8209036c608e18d0b83bf97e37281720c66cf5b4a8084fb97ed7d81aa7aa1f58e7387
7
- data.tar.gz: 38a40361a3ee2d1b34efbf21e482fa4663f89d20731490e1ebd2e6e014cd66537cd95e0da9fdef14ec33d02f59d0aa054fcca383e158e2ed9fbe544501771ce6
6
+ metadata.gz: 4f56da51ab47060e7d58b3ac469ca15501b2b457a936b634b361afa0f832a27130c74169d7c1da4f3a188f56db58335c59325de261990ee02bbdbd1431ad24da
7
+ data.tar.gz: 77dc4e5cdb5d0098ba8f91279bc4d658c653130313fbb49de52c3e2c2b707c4501fcf77a31f2a6a54b0b48ca12e54ab674423ab377e22e043a6e6cce0093a6df
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module CoolCrawler
4
- VERSION = "0.4.0"
4
+ VERSION = "0.4.3"
5
5
  end
data/lib/cool_crawler.rb CHANGED
@@ -13,11 +13,13 @@ module CoolCrawler
13
13
  # This is the class that handles the queue and async requests
14
14
  class CrawlerPool
15
15
 
16
- def initialize(start, max_connections, delay)
16
+ def initialize(start, max_connections, delay, max_pages=50)
17
17
  @uri = URI(start)
18
+ @max_pages = max_pages
18
19
  @site = "#{uri.scheme}://#{uri.host}"
19
20
  @max_connections = max_connections
20
21
  @delay = delay
22
+ @visited_pages = 0
21
23
  visited[uri.path] = 1
22
24
  queue << uri.path
23
25
  end
@@ -49,9 +51,12 @@ module CoolCrawler
49
51
  pages.each do |page|
50
52
  barrier.async do
51
53
  response = internet.get URI.join(@site, page).to_s
52
- body = response.read
53
- links = gather_links_uri(body, URI.join(uri, page))
54
- after(page, links, body)
54
+ body = Nokogiri::HTML(response.read)
55
+ body.search('//img').remove
56
+ body.search('//style').remove
57
+ body.search('//script').remove
58
+ links = gather_links_uri(body.to_s, URI.join(uri, page))
59
+ after(page, links, body.to_s)
55
60
  links.each do |link|
56
61
  enqueue(link)
57
62
  add_to_visited(link)
@@ -71,7 +76,9 @@ module CoolCrawler
71
76
  next if a["href"].nil?
72
77
  uri_a = URI(a["href"].strip.split('#')[0].sub(/\\|(\s+$)/, ""))
73
78
  begin
74
- links << URI.join(page, uri_a).path if (uri_a.host == uri.host || uri_a.host.nil?) && uri_a.path
79
+ if @visited_pages <= @max_pages
80
+ links << URI.join(page, uri_a).path if (uri_a.host == uri.host || uri_a.host.nil?) && uri_a.path
81
+ end
75
82
  rescue
76
83
  # do nothing
77
84
  end
@@ -97,6 +104,7 @@ module CoolCrawler
97
104
  else
98
105
  visited[path] = 1
99
106
  end
107
+ @visited_pages += 1
100
108
  end
101
109
 
102
110
  def sorted_visited
@@ -104,7 +112,17 @@ module CoolCrawler
104
112
  end
105
113
 
106
114
  def enqueue(path)
107
- queue << path unless visited.include?(path)
115
+ unless visited.include?(path)
116
+ queue << path
117
+ end
118
+ end
119
+
120
+ def sum_pages
121
+ sum = 0
122
+ visited.each do |_k, v|
123
+ sum += v
124
+ end
125
+ sum
108
126
  end
109
127
  end
110
128
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: coolCrawler
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.0
4
+ version: 0.4.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - William Wright
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2022-10-14 00:00:00.000000000 Z
11
+ date: 2022-11-01 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rspec