coolCrawler 0.4.2 → 0.4.3

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 035e8dbe27d4648d16243fea96c692b4a9bde8973d13df02b66aad3aa742afc3
4
- data.tar.gz: 687985a3d0e391aedc610ecae0329299a36c8edc90704b9f9d92118532d3d7cc
3
+ metadata.gz: a8c2944180ee7c5d7f1241fd62bf5e89973a2fca742c51567e1550f14f95f70a
4
+ data.tar.gz: 6635f39af2babaead932e94cae419738d7bd61e0ecc2f23216cb6102607904bf
5
5
  SHA512:
6
- metadata.gz: 224908b5e8f495063ddc81c178f10e276b2744c8d8b71b16538b3ac2eed70978e686cecc1cbf900d3380a1919ec5e438dd0a1dc559b7d6abbde30e9bd44f819e
7
- data.tar.gz: 0ee4aa741315bdd8a2774ee28b762a521f12489356621efe6cf38fab66898a2210313b7e8f07eb74ebd95492b20b80e6548d84a0333f10ac8c09ef6e96410b53
6
+ metadata.gz: 4f56da51ab47060e7d58b3ac469ca15501b2b457a936b634b361afa0f832a27130c74169d7c1da4f3a188f56db58335c59325de261990ee02bbdbd1431ad24da
7
+ data.tar.gz: 77dc4e5cdb5d0098ba8f91279bc4d658c653130313fbb49de52c3e2c2b707c4501fcf77a31f2a6a54b0b48ca12e54ab674423ab377e22e043a6e6cce0093a6df
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module CoolCrawler
4
- VERSION = "0.4.2"
4
+ VERSION = "0.4.3"
5
5
  end
data/lib/cool_crawler.rb CHANGED
@@ -51,12 +51,14 @@ module CoolCrawler
51
51
  pages.each do |page|
52
52
  barrier.async do
53
53
  response = internet.get URI.join(@site, page).to_s
54
- body = response.read
55
- links = gather_links_uri(body, URI.join(uri, page))
56
- after(page, links, body)
54
+ body = Nokogiri::HTML(response.read)
55
+ body.search('//img').remove
56
+ body.search('//style').remove
57
+ body.search('//script').remove
58
+ links = gather_links_uri(body.to_s, URI.join(uri, page))
59
+ after(page, links, body.to_s)
57
60
  links.each do |link|
58
61
  enqueue(link)
59
- @visited_pages += 1
60
62
  add_to_visited(link)
61
63
  end
62
64
  end
@@ -74,7 +76,9 @@ module CoolCrawler
74
76
  next if a["href"].nil?
75
77
  uri_a = URI(a["href"].strip.split('#')[0].sub(/\\|(\s+$)/, ""))
76
78
  begin
77
- links << URI.join(page, uri_a).path if (uri_a.host == uri.host || uri_a.host.nil?) && uri_a.path
79
+ if @visited_pages <= @max_pages
80
+ links << URI.join(page, uri_a).path if (uri_a.host == uri.host || uri_a.host.nil?) && uri_a.path
81
+ end
78
82
  rescue
79
83
  # do nothing
80
84
  end
@@ -100,6 +104,7 @@ module CoolCrawler
100
104
  else
101
105
  visited[path] = 1
102
106
  end
107
+ @visited_pages += 1
103
108
  end
104
109
 
105
110
  def sorted_visited
@@ -107,9 +112,8 @@ module CoolCrawler
107
112
  end
108
113
 
109
114
  def enqueue(path)
110
- unless visited.include?(path) or @visited_pages > @max_pages
115
+ unless visited.include?(path)
111
116
  queue << path
112
- p queue.size
113
117
  end
114
118
  end
115
119
 
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: coolCrawler
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.2
4
+ version: 0.4.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - William Wright
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2022-10-31 00:00:00.000000000 Z
11
+ date: 2022-11-01 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rspec