coolCrawler 0.4.2 → 0.4.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: 035e8dbe27d4648d16243fea96c692b4a9bde8973d13df02b66aad3aa742afc3
- data.tar.gz: 687985a3d0e391aedc610ecae0329299a36c8edc90704b9f9d92118532d3d7cc
+ metadata.gz: 79059c4a0ef1c026082b3ef2b25ee13c2310984a2049be4345f0345b226d2e99
+ data.tar.gz: 32d02d444ef3553df02c2764073b1d9f5cbdabea272ee1d2b3eb9e80a5a29434
  SHA512:
- metadata.gz: 224908b5e8f495063ddc81c178f10e276b2744c8d8b71b16538b3ac2eed70978e686cecc1cbf900d3380a1919ec5e438dd0a1dc559b7d6abbde30e9bd44f819e
- data.tar.gz: 0ee4aa741315bdd8a2774ee28b762a521f12489356621efe6cf38fab66898a2210313b7e8f07eb74ebd95492b20b80e6548d84a0333f10ac8c09ef6e96410b53
+ metadata.gz: 56d89149672219eb082eb188d2d79fde2d97931efe8295983719a5693c5b96c07e607a309d03690e8a739a2a3018f73c7d0fbd18da6003cbafa7969633890b2e
+ data.tar.gz: f37a6061b4b318cb19423ab4de961a98b625d1e17e6f85eb78524794a967a2f5dda4bceaa5c5414b2df33a359c9c0fd5a8e8a7bfe22c4f872e41a9e9c5f46d48
@@ -1,5 +1,5 @@
  # frozen_string_literal: true
 
  module CoolCrawler
- VERSION = "0.4.2"
+ VERSION = "0.4.4"
  end
data/lib/cool_crawler.rb CHANGED
@@ -31,7 +31,7 @@ module CoolCrawler
  end
 
  def run
- until queue.empty?
+ until queue.empty? || @visited_pages >= @max_pages
  send_crawlers
  sleep(delay)
  end
@@ -43,7 +43,10 @@ module CoolCrawler
 
  def send_crawlers
  pages = []
- pages << queue.pop until queue.empty? || pages.size >= max_connections
+ until queue.empty? || pages.size >= max_connections || @visited_pages >= @max_pages
+ pages << queue.pop
+ @visited_pages += 1
+ end
  Async do
  internet = Async::HTTP::Internet.new
  barrier = Async::Barrier.new
@@ -51,12 +54,14 @@ module CoolCrawler
  pages.each do |page|
  barrier.async do
  response = internet.get URI.join(@site, page).to_s
- body = response.read
- links = gather_links_uri(body, URI.join(uri, page))
- after(page, links, body)
+ body = Nokogiri::HTML(response.read)
+ body.search('//img').remove
+ body.search('//style').remove
+ body.search('//script').remove
+ links = gather_links_uri(body.to_s, URI.join(uri, page))
+ after(page, links, body.to_s)
  links.each do |link|
  enqueue(link)
- @visited_pages += 1
  add_to_visited(link)
  end
  end
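The hunk above replaces the raw response body with a parsed Nokogiri document and removes img, style, and script nodes before the page is handed to gather_links_uri and the after callback. A minimal standalone sketch of that technique, assuming Nokogiri is installed; the helper name links_from and the sample HTML are illustrative, not the gem's API:

require "nokogiri"
require "uri"

# Parse a fetched page, strip img/style/script nodes as the change above does,
# then collect same-host paths from the remaining anchors.
def links_from(html, base)
  doc = Nokogiri::HTML(html)
  doc.search("//img", "//style", "//script").remove
  doc.css("a[href]").filter_map do |a|
    href = a["href"].strip.split("#").first
    next if href.nil? || href.empty?
    target = URI.join(base, href)
    target.path if target.host == URI(base).host
  rescue URI::Error
    nil
  end
end

puts links_from('<a href="/about">About</a><script>var x = 1;</script>', "https://example.com/")
# prints: /about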
@@ -74,7 +79,7 @@ module CoolCrawler
  next if a["href"].nil?
  uri_a = URI(a["href"].strip.split('#')[0].sub(/\\|(\s+$)/, ""))
  begin
- links << URI.join(page, uri_a).path if (uri_a.host == uri.host || uri_a.host.nil?) && uri_a.path
+ links << URI.join(page, uri_a).path if (uri_a.host == uri.host || uri_a.host.nil?) && uri_a.path
  rescue
  # do nothing
  end
@@ -107,9 +112,8 @@ module CoolCrawler
  end
 
  def enqueue(path)
- unless visited.include?(path) or @visited_pages > @max_pages
+ unless visited.include?(path)
  queue << path
- p queue.size
  end
  end
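Taken together, the hunks in this file move the @max_pages check out of enqueue and into run and send_crawlers, so pages are counted as they are popped for fetching rather than when their links are queued, and the debug print of the queue size is dropped. A minimal sketch of that page-budget pattern, with assumed names (PageBudgetCrawler, next_batch) that are not part of the gem:

require "set"

# Count pages as they are dequeued for fetching, and stop both the batch
# builder and the outer loop once the budget is spent.
class PageBudgetCrawler
  def initialize(start_path, max_pages:, max_connections: 5)
    @queue = [start_path]
    @visited = Set.new
    @visited_pages = 0
    @max_pages = max_pages
    @max_connections = max_connections
  end

  def run
    until @queue.empty? || @visited_pages >= @max_pages
      next_batch.each { |path| @visited << path } # fetching/parsing would happen here
    end
  end

  private

  # Pop at most max_connections paths without exceeding the overall page budget.
  def next_batch
    batch = []
    until @queue.empty? || batch.size >= @max_connections || @visited_pages >= @max_pages
      batch << @queue.shift
      @visited_pages += 1
    end
    batch
  end
end

PageBudgetCrawler.new("/", max_pages: 100).run

Counting at dequeue time keeps the cap accurate even when a single page yields many links.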
 
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: coolCrawler
  version: !ruby/object:Gem::Version
- version: 0.4.2
+ version: 0.4.4
  platform: ruby
  authors:
  - William Wright
  autorequire:
  bindir: exe
  cert_chain: []
- date: 2022-10-31 00:00:00.000000000 Z
+ date: 2022-11-01 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
  name: rspec