coolCrawler 0.4.0 → 0.4.3

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 7aecaddb63af3eb7a621f2cd75472291e0376ca57ef30ec229edcbe274bc704b
4
- data.tar.gz: f42a0b9230a6886169cc7dd35f25f03dcb8b1f6d53ec5515c32ed647e2249ee5
3
+ metadata.gz: a8c2944180ee7c5d7f1241fd62bf5e89973a2fca742c51567e1550f14f95f70a
4
+ data.tar.gz: 6635f39af2babaead932e94cae419738d7bd61e0ecc2f23216cb6102607904bf
5
5
  SHA512:
6
- metadata.gz: 36442036c64c324adf090c7ccb4b6cc474875471ac1dc54ead21604e9de8209036c608e18d0b83bf97e37281720c66cf5b4a8084fb97ed7d81aa7aa1f58e7387
7
- data.tar.gz: 38a40361a3ee2d1b34efbf21e482fa4663f89d20731490e1ebd2e6e014cd66537cd95e0da9fdef14ec33d02f59d0aa054fcca383e158e2ed9fbe544501771ce6
6
+ metadata.gz: 4f56da51ab47060e7d58b3ac469ca15501b2b457a936b634b361afa0f832a27130c74169d7c1da4f3a188f56db58335c59325de261990ee02bbdbd1431ad24da
7
+ data.tar.gz: 77dc4e5cdb5d0098ba8f91279bc4d658c653130313fbb49de52c3e2c2b707c4501fcf77a31f2a6a54b0b48ca12e54ab674423ab377e22e043a6e6cce0093a6df
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module CoolCrawler
4
- VERSION = "0.4.0"
4
+ VERSION = "0.4.3"
5
5
  end
data/lib/cool_crawler.rb CHANGED
@@ -13,11 +13,13 @@ module CoolCrawler
13
13
  # This is the class that handles the queue and async requests
14
14
  class CrawlerPool
15
15
 
16
- def initialize(start, max_connections, delay)
16
+ def initialize(start, max_connections, delay, max_pages=50)
17
17
  @uri = URI(start)
18
+ @max_pages = max_pages
18
19
  @site = "#{uri.scheme}://#{uri.host}"
19
20
  @max_connections = max_connections
20
21
  @delay = delay
22
+ @visited_pages = 0
21
23
  visited[uri.path] = 1
22
24
  queue << uri.path
23
25
  end
@@ -49,9 +51,12 @@ module CoolCrawler
49
51
  pages.each do |page|
50
52
  barrier.async do
51
53
  response = internet.get URI.join(@site, page).to_s
52
- body = response.read
53
- links = gather_links_uri(body, URI.join(uri, page))
54
- after(page, links, body)
54
+ body = Nokogiri::HTML(response.read)
55
+ body.search('//img').remove
56
+ body.search('//style').remove
57
+ body.search('//script').remove
58
+ links = gather_links_uri(body.to_s, URI.join(uri, page))
59
+ after(page, links, body.to_s)
55
60
  links.each do |link|
56
61
  enqueue(link)
57
62
  add_to_visited(link)
@@ -71,7 +76,9 @@ module CoolCrawler
71
76
  next if a["href"].nil?
72
77
  uri_a = URI(a["href"].strip.split('#')[0].sub(/\\|(\s+$)/, ""))
73
78
  begin
74
- links << URI.join(page, uri_a).path if (uri_a.host == uri.host || uri_a.host.nil?) && uri_a.path
79
+ if @visited_pages <= @max_pages
80
+ links << URI.join(page, uri_a).path if (uri_a.host == uri.host || uri_a.host.nil?) && uri_a.path
81
+ end
75
82
  rescue
76
83
  # do nothing
77
84
  end
@@ -97,6 +104,7 @@ module CoolCrawler
97
104
  else
98
105
  visited[path] = 1
99
106
  end
107
+ @visited_pages += 1
100
108
  end
101
109
 
102
110
  def sorted_visited
@@ -104,7 +112,17 @@ module CoolCrawler
104
112
  end
105
113
 
106
114
  def enqueue(path)
107
- queue << path unless visited.include?(path)
115
+ unless visited.include?(path)
116
+ queue << path
117
+ end
118
+ end
119
+
120
+ def sum_pages
121
+ sum = 0
122
+ visited.each do |_k, v|
123
+ sum += v
124
+ end
125
+ sum
108
126
  end
109
127
  end
110
128
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: coolCrawler
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.0
4
+ version: 0.4.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - William Wright
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2022-10-14 00:00:00.000000000 Z
11
+ date: 2022-11-01 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rspec