coolCrawler 0.4.2 → 0.4.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/coolCrawler/version.rb +1 -1
- data/lib/cool_crawler.rb +11 -7
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: a8c2944180ee7c5d7f1241fd62bf5e89973a2fca742c51567e1550f14f95f70a
|
4
|
+
data.tar.gz: 6635f39af2babaead932e94cae419738d7bd61e0ecc2f23216cb6102607904bf
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 4f56da51ab47060e7d58b3ac469ca15501b2b457a936b634b361afa0f832a27130c74169d7c1da4f3a188f56db58335c59325de261990ee02bbdbd1431ad24da
|
7
|
+
data.tar.gz: 77dc4e5cdb5d0098ba8f91279bc4d658c653130313fbb49de52c3e2c2b707c4501fcf77a31f2a6a54b0b48ca12e54ab674423ab377e22e043a6e6cce0093a6df
|
data/lib/coolCrawler/version.rb
CHANGED
data/lib/cool_crawler.rb
CHANGED
@@ -51,12 +51,14 @@ module CoolCrawler
|
|
51
51
|
pages.each do |page|
|
52
52
|
barrier.async do
|
53
53
|
response = internet.get URI.join(@site, page).to_s
|
54
|
-
body = response.read
|
55
|
-
|
56
|
-
|
54
|
+
body = Nokogiri::HTML(response.read)
|
55
|
+
body.search('//img').remove
|
56
|
+
body.search('//style').remove
|
57
|
+
body.search('//script').remove
|
58
|
+
links = gather_links_uri(body.to_s, URI.join(uri, page))
|
59
|
+
after(page, links, body.to_s)
|
57
60
|
links.each do |link|
|
58
61
|
enqueue(link)
|
59
|
-
@visited_pages += 1
|
60
62
|
add_to_visited(link)
|
61
63
|
end
|
62
64
|
end
|
@@ -74,7 +76,9 @@ module CoolCrawler
|
|
74
76
|
next if a["href"].nil?
|
75
77
|
uri_a = URI(a["href"].strip.split('#')[0].sub(/\\|(\s+$)/, ""))
|
76
78
|
begin
|
77
|
-
|
79
|
+
if @visited_pages <= @max_pages
|
80
|
+
links << URI.join(page, uri_a).path if (uri_a.host == uri.host || uri_a.host.nil?) && uri_a.path
|
81
|
+
end
|
78
82
|
rescue
|
79
83
|
# do nothing
|
80
84
|
end
|
@@ -100,6 +104,7 @@ module CoolCrawler
|
|
100
104
|
else
|
101
105
|
visited[path] = 1
|
102
106
|
end
|
107
|
+
@visited_pages += 1
|
103
108
|
end
|
104
109
|
|
105
110
|
def sorted_visited
|
@@ -107,9 +112,8 @@ module CoolCrawler
|
|
107
112
|
end
|
108
113
|
|
109
114
|
def enqueue(path)
|
110
|
-
unless visited.include?(path)
|
115
|
+
unless visited.include?(path)
|
111
116
|
queue << path
|
112
|
-
p queue.size
|
113
117
|
end
|
114
118
|
end
|
115
119
|
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: coolCrawler
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.4.2
|
4
|
+
version: 0.4.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- William Wright
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2022-
|
11
|
+
date: 2022-11-01 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rspec
|