coolCrawler 0.2.0 → 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +6 -1
- data/README.md +1 -0
- data/lib/coolCrawler/version.rb +1 -1
- data/lib/cool_crawler.rb +21 -38
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: e927853d47dec36f11557336b9a7c1f2d986b1a9933b78878f2cef18f585cf54
|
4
|
+
data.tar.gz: 67a189af1c21a32650b4cd58573aac781cf2f2a43ce6b20a88eb04a3298fbb79
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 6f3e9b7ff0b17807160670456b7d1bf49079760accef770620e0689e66234f6737dd90c973178f1bc600d22c2bc114e95d198fd756a656f006c2d9f35e2b7167
|
7
|
+
data.tar.gz: 2e03b27c2142c7eb389df2e4955070924c843e92d2a167e6aa1b5411e2698bcfbc551999542575b46eda561a949eaca2a3445a1e7db2ba6304700b58ed8a35b7
|
data/CHANGELOG.md
CHANGED
data/README.md
CHANGED
data/lib/coolCrawler/version.rb
CHANGED
data/lib/cool_crawler.rb
CHANGED
@@ -11,10 +11,10 @@ module CoolCrawler
|
|
11
11
|
class Error < StandardError; end
|
12
12
|
|
13
13
|
# This is the class that handles the queue and async requests
|
14
|
-
class
|
14
|
+
class CrawlerPool
|
15
15
|
|
16
16
|
def initialize(start, max_connections, delay)
|
17
|
-
uri = URI(start)
|
17
|
+
@uri = URI(start)
|
18
18
|
@site = "#{uri.scheme}://#{uri.host}"
|
19
19
|
@max_connections = max_connections
|
20
20
|
@delay = delay
|
@@ -22,7 +22,7 @@ module CoolCrawler
|
|
22
22
|
queue << uri.path
|
23
23
|
end
|
24
24
|
|
25
|
-
attr_reader :max_connections, :delay, :callback
|
25
|
+
attr_reader :max_connections, :uri, :delay, :callback, :site
|
26
26
|
|
27
27
|
def set_callback(proc)
|
28
28
|
@callback=proc
|
@@ -41,17 +41,15 @@ module CoolCrawler
|
|
41
41
|
|
42
42
|
def send_crawlers
|
43
43
|
pages = []
|
44
|
-
until queue.empty? || pages.size >= max_connections
|
45
|
-
pages << queue.pop
|
46
|
-
end
|
44
|
+
pages << queue.pop until queue.empty? || pages.size >= max_connections
|
47
45
|
Async do
|
48
46
|
internet = Async::HTTP::Internet.new
|
49
47
|
barrier = Async::Barrier.new
|
50
48
|
|
51
49
|
pages.each do |page|
|
52
50
|
barrier.async do
|
53
|
-
response = internet.get URI.join(@site, page)
|
54
|
-
links =
|
51
|
+
response = internet.get URI.join(@site, page).to_s
|
52
|
+
links = gather_links_uri(response.read, URI.join(uri, page))
|
55
53
|
after(page, links)
|
56
54
|
links.each do |link|
|
57
55
|
enqueue(link)
|
@@ -65,6 +63,21 @@ module CoolCrawler
|
|
65
63
|
end
|
66
64
|
end
|
67
65
|
|
66
|
+
def gather_links_uri(body, page)
|
67
|
+
links = []
|
68
|
+
doc = Nokogiri::HTML(body)
|
69
|
+
doc.xpath("//a").each do |a|
|
70
|
+
next if a["href"].nil?
|
71
|
+
uri_a = URI(a["href"].strip.split('#')[0].sub(/\\|(\s+$)/, ""))
|
72
|
+
begin
|
73
|
+
links << URI.join(page, uri_a).path if (uri_a.host == uri.host || uri_a.host.nil?) && uri_a.path
|
74
|
+
rescue
|
75
|
+
# do nothing
|
76
|
+
end
|
77
|
+
end
|
78
|
+
links
|
79
|
+
end
|
80
|
+
|
68
81
|
def queue
|
69
82
|
@queue ||= Queue.new
|
70
83
|
end
|
@@ -92,35 +105,5 @@ module CoolCrawler
|
|
92
105
|
def enqueue(path)
|
93
106
|
queue << path unless visited.include?(path)
|
94
107
|
end
|
95
|
-
|
96
|
-
end
|
97
|
-
|
98
|
-
|
99
|
-
# This is the individual crawler
|
100
|
-
class Crawler
|
101
|
-
include CoolCrawler
|
102
|
-
def initialize(current, response)
|
103
|
-
@current = URI(current)
|
104
|
-
@response = response
|
105
|
-
end
|
106
|
-
|
107
|
-
attr_reader :current, :response
|
108
|
-
|
109
|
-
def gather_links_uri
|
110
|
-
links = []
|
111
|
-
doc = Nokogiri::HTML(response)
|
112
|
-
doc.xpath("//a").each do |a|
|
113
|
-
next if a["href"].nil?
|
114
|
-
uri_a = URI(a["href"].strip.split('#')[0].sub(/\\|(\s+$)/, ""))
|
115
|
-
begin
|
116
|
-
link = URI.join(current, uri_a).path if (uri_a.host == current.host || uri_a.host.nil?) && uri_a.path
|
117
|
-
links << URI.join(current, uri_a).path if (uri_a.host == current.host || uri_a.host.nil?) && uri_a.path
|
118
|
-
rescue
|
119
|
-
# do nothing
|
120
|
-
end
|
121
|
-
end
|
122
|
-
links
|
123
|
-
end
|
124
|
-
|
125
108
|
end
|
126
109
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: coolCrawler
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.0
|
4
|
+
version: 0.3.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- William Wright
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2022-10-
|
11
|
+
date: 2022-10-05 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rspec
|