coolCrawler 0.4.2 → 0.4.4

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 035e8dbe27d4648d16243fea96c692b4a9bde8973d13df02b66aad3aa742afc3
-  data.tar.gz: 687985a3d0e391aedc610ecae0329299a36c8edc90704b9f9d92118532d3d7cc
+  metadata.gz: 79059c4a0ef1c026082b3ef2b25ee13c2310984a2049be4345f0345b226d2e99
+  data.tar.gz: 32d02d444ef3553df02c2764073b1d9f5cbdabea272ee1d2b3eb9e80a5a29434
 SHA512:
-  metadata.gz: 224908b5e8f495063ddc81c178f10e276b2744c8d8b71b16538b3ac2eed70978e686cecc1cbf900d3380a1919ec5e438dd0a1dc559b7d6abbde30e9bd44f819e
-  data.tar.gz: 0ee4aa741315bdd8a2774ee28b762a521f12489356621efe6cf38fab66898a2210313b7e8f07eb74ebd95492b20b80e6548d84a0333f10ac8c09ef6e96410b53
+  metadata.gz: 56d89149672219eb082eb188d2d79fde2d97931efe8295983719a5693c5b96c07e607a309d03690e8a739a2a3018f73c7d0fbd18da6003cbafa7969633890b2e
+  data.tar.gz: f37a6061b4b318cb19423ab4de961a98b625d1e17e6f85eb78524794a967a2f5dda4bceaa5c5414b2df33a359c9c0fd5a8e8a7bfe22c4f872e41a9e9c5f46d48
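The values above are the new SHA256/SHA512 digests of the metadata.gz and data.tar.gz members inside the rebuilt .gem package. As a minimal sketch (assuming the package has already been unpacked locally with tar -xf coolCrawler-0.4.4.gem, which is not part of this diff), the SHA256 values can be recomputed with Ruby's standard Digest library and compared against checksums.yaml by eye:

require "digest"

# Recompute the SHA256 digest of each unpacked gem member.
%w[metadata.gz data.tar.gz].each do |member|
  puts "#{member}: #{Digest::SHA256.file(member).hexdigest}"
end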
data/lib/cool_crawler/version.rb CHANGED
@@ -1,5 +1,5 @@
 # frozen_string_literal: true

 module CoolCrawler
-  VERSION = "0.4.2"
+  VERSION = "0.4.4"
 end
data/lib/cool_crawler.rb CHANGED
@@ -31,7 +31,7 @@ module CoolCrawler
   end

   def run
-    until queue.empty?
+    until queue.empty? || @visited_pages >= @max_pages
       send_crawlers
       sleep(delay)
     end
@@ -43,7 +43,10 @@ module CoolCrawler

   def send_crawlers
     pages = []
-    pages << queue.pop until queue.empty? || pages.size >= max_connections
+    until queue.empty? || pages.size >= max_connections || @visited_pages >= @max_pages
+      pages << queue.pop
+      @visited_pages += 1
+    end
     Async do
       internet = Async::HTTP::Internet.new
       barrier = Async::Barrier.new
@@ -51,12 +54,14 @@ module CoolCrawler
       pages.each do |page|
         barrier.async do
           response = internet.get URI.join(@site, page).to_s
-          body = response.read
-          links = gather_links_uri(body, URI.join(uri, page))
-          after(page, links, body)
+          body = Nokogiri::HTML(response.read)
+          body.search('//img').remove
+          body.search('//style').remove
+          body.search('//script').remove
+          links = gather_links_uri(body.to_s, URI.join(uri, page))
+          after(page, links, body.to_s)
           links.each do |link|
             enqueue(link)
-            @visited_pages += 1
             add_to_visited(link)
           end
         end
@@ -74,7 +79,7 @@ module CoolCrawler
       next if a["href"].nil?
       uri_a = URI(a["href"].strip.split('#')[0].sub(/\\|(\s+$)/, ""))
       begin
-        links << URI.join(page, uri_a).path if (uri_a.host == uri.host || uri_a.host.nil?) && uri_a.path
+        links << URI.join(page, uri_a).path if (uri_a.host == uri.host || uri_a.host.nil?) && uri_a.path
       rescue
         # do nothing
       end
@@ -107,9 +112,8 @@ module CoolCrawler
   end

   def enqueue(path)
-    unless visited.include?(path) or @visited_pages > @max_pages
+    unless visited.include?(path)
       queue << path
-      p queue.size
     end
   end

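Taken together, the lib/cool_crawler.rb changes do two things: the crawl now stops once @visited_pages reaches @max_pages, with the counter incremented as pages are popped from the queue in send_crawlers rather than once per discovered link, and each response body is parsed with Nokogiri so img, style, and script nodes are stripped before the cleaned HTML reaches gather_links_uri and the after callback. The stand-alone sketch below reproduces only that body-cleaning step; the sample HTML string is made up for illustration and is not part of the gem.

require "nokogiri"

# Parse an HTML document and drop the node types the crawler now ignores.
html = <<~HTML
  <html>
    <head><style>body { color: red; }</style></head>
    <body>
      <img src="logo.png">
      <script>track();</script>
      <a href="/about">About</a>
    </body>
  </html>
HTML

body = Nokogiri::HTML(html)
body.search('//img').remove    # remove images
body.search('//style').remove  # remove inline CSS
body.search('//script').remove # remove scripts

puts body.to_s # only the anchor tag and surrounding markup remain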
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: coolCrawler
 version: !ruby/object:Gem::Version
-  version: 0.4.2
+  version: 0.4.4
 platform: ruby
 authors:
 - William Wright
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2022-10-31 00:00:00.000000000 Z
+date: 2022-11-01 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rspec