coolCrawler 0.4.2 → 0.4.3

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between those package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 035e8dbe27d4648d16243fea96c692b4a9bde8973d13df02b66aad3aa742afc3
4
- data.tar.gz: 687985a3d0e391aedc610ecae0329299a36c8edc90704b9f9d92118532d3d7cc
3
+ metadata.gz: a8c2944180ee7c5d7f1241fd62bf5e89973a2fca742c51567e1550f14f95f70a
4
+ data.tar.gz: 6635f39af2babaead932e94cae419738d7bd61e0ecc2f23216cb6102607904bf
5
5
  SHA512:
6
- metadata.gz: 224908b5e8f495063ddc81c178f10e276b2744c8d8b71b16538b3ac2eed70978e686cecc1cbf900d3380a1919ec5e438dd0a1dc559b7d6abbde30e9bd44f819e
7
- data.tar.gz: 0ee4aa741315bdd8a2774ee28b762a521f12489356621efe6cf38fab66898a2210313b7e8f07eb74ebd95492b20b80e6548d84a0333f10ac8c09ef6e96410b53
6
+ metadata.gz: 4f56da51ab47060e7d58b3ac469ca15501b2b457a936b634b361afa0f832a27130c74169d7c1da4f3a188f56db58335c59325de261990ee02bbdbd1431ad24da
7
+ data.tar.gz: 77dc4e5cdb5d0098ba8f91279bc4d658c653130313fbb49de52c3e2c2b707c4501fcf77a31f2a6a54b0b48ca12e54ab674423ab377e22e043a6e6cce0093a6df
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module CoolCrawler
4
- VERSION = "0.4.2"
4
+ VERSION = "0.4.3"
5
5
  end
data/lib/cool_crawler.rb CHANGED
@@ -51,12 +51,14 @@ module CoolCrawler
51
51
  pages.each do |page|
52
52
  barrier.async do
53
53
  response = internet.get URI.join(@site, page).to_s
54
- body = response.read
55
- links = gather_links_uri(body, URI.join(uri, page))
56
- after(page, links, body)
54
+ body = Nokogiri::HTML(response.read)
55
+ body.search('//img').remove
56
+ body.search('//style').remove
57
+ body.search('//script').remove
58
+ links = gather_links_uri(body.to_s, URI.join(uri, page))
59
+ after(page, links, body.to_s)
57
60
  links.each do |link|
58
61
  enqueue(link)
59
- @visited_pages += 1
60
62
  add_to_visited(link)
61
63
  end
62
64
  end
@@ -74,7 +76,9 @@ module CoolCrawler
74
76
  next if a["href"].nil?
75
77
  uri_a = URI(a["href"].strip.split('#')[0].sub(/\\|(\s+$)/, ""))
76
78
  begin
77
- links << URI.join(page, uri_a).path if (uri_a.host == uri.host || uri_a.host.nil?) && uri_a.path
79
+ if @visited_pages <= @max_pages
80
+ links << URI.join(page, uri_a).path if (uri_a.host == uri.host || uri_a.host.nil?) && uri_a.path
81
+ end
78
82
  rescue
79
83
  # do nothing
80
84
  end
@@ -100,6 +104,7 @@ module CoolCrawler
100
104
  else
101
105
  visited[path] = 1
102
106
  end
107
+ @visited_pages += 1
103
108
  end
104
109
 
105
110
  def sorted_visited
@@ -107,9 +112,8 @@ module CoolCrawler
107
112
  end
108
113
 
109
114
  def enqueue(path)
110
- unless visited.include?(path) or @visited_pages > @max_pages
115
+ unless visited.include?(path)
111
116
  queue << path
112
- p queue.size
113
117
  end
114
118
  end
115
119
 
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: coolCrawler
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.2
4
+ version: 0.4.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - William Wright
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2022-10-31 00:00:00.000000000 Z
11
+ date: 2022-11-01 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rspec