coolCrawler 0.2.0 → 0.3.0

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 2158b8a1d5ceeaeb6d0d7813e6ae5020a49eaffd620453c8490e3e64749fecbb
-  data.tar.gz: 76fcf09ba3252a94a748b6a16ed43787b7fd28f0806aff430a58c2aa884d6266
+  metadata.gz: e927853d47dec36f11557336b9a7c1f2d986b1a9933b78878f2cef18f585cf54
+  data.tar.gz: 67a189af1c21a32650b4cd58573aac781cf2f2a43ce6b20a88eb04a3298fbb79
 SHA512:
-  metadata.gz: 3a51a6c79a067d65e2efb064b0e7aedc9740d5137143cef98d5007d4dfff9a8cd4fde144e8ebca36becf418f01039dfebe037ee1e9f0f658afa624c4ae9f8336
-  data.tar.gz: 3bd9259785c29120037066739c7c55b4b6b011ac7aa252b9f27a82516c8340e90825dd7d868f54e9c49cd7b41ed4804892b1494316adb9bfd3629fa34cbbfffc
+  metadata.gz: 6f3e9b7ff0b17807160670456b7d1bf49079760accef770620e0689e66234f6737dd90c973178f1bc600d22c2bc114e95d198fd756a656f006c2d9f35e2b7167
+  data.tar.gz: 2e03b27c2142c7eb389df2e4955070924c843e92d2a167e6aa1b5411e2698bcfbc551999542575b46eda561a949eaca2a3445a1e7db2ba6304700b58ed8a35b7
data/CHANGELOG.md CHANGED
@@ -1,4 +1,9 @@
-## [Unreleased]
+## [0.3.0] - 2022-09-24
+
+- Removed the Crawler class
+- Renamed CrawlerServer to CrawlerPool
+- Added an attr_reader for site
+- The crawler still ignores outgoing links for now
 
 ## [0.1.0] - 2022-09-24
 
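For callers coming from 0.2.0, a minimal migration sketch, assuming the constructor keeps the three arguments shown in the README example below (the `pool` variable name is illustrative):

```ruby
require 'cool_crawler'

# 0.2.0:
# crawler = CoolCrawler::CrawlerServer.new("https://github.com", 10, 0.01)

# 0.3.0: same arguments, renamed class
pool = CoolCrawler::CrawlerPool.new("https://github.com", 10, 0.01)

# readers newly exposed in 0.3.0
pool.site # => "https://github.com"
pool.uri  # => the parsed start URI
```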
data/README.md CHANGED
@@ -23,6 +23,7 @@ Or install it yourself as:
 
 
 ```ruby
+require 'cool_crawler'
 # create a set of 10 crawlers with a delay of 0.01 seconds between each group of crawls
 crawler = CoolCrawler::CrawlerServer.new("https://github.com", 10, 0.01)
 
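The README snippet still constructs a `CrawlerServer`; under 0.3.0 the renamed class applies. The diff to cool_crawler.rb below also shows a `set_callback` writer and an `after(page, links)` call; a hedged sketch of registering a callback, assuming the proc receives each crawled page and the links gathered from it:

```ruby
pool = CoolCrawler::CrawlerPool.new("https://github.com", 10, 0.01)

# Assumption: the proc is invoked once per crawled page with the page path
# and the array of links found on that page.
on_page = proc do |page, links|
  puts "#{page}: #{links.size} links"
end
pool.set_callback(on_page)
```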
@@ -1,5 +1,5 @@
 # frozen_string_literal: true
 
 module CoolCrawler
-  VERSION = "0.2.0"
+  VERSION = "0.3.0"
 end
data/lib/cool_crawler.rb CHANGED
@@ -11,10 +11,10 @@ module CoolCrawler
   class Error < StandardError; end
 
   # This is the class that handles the queue and async requests
-  class CrawlerServer
+  class CrawlerPool
 
     def initialize(start, max_connections, delay)
-      uri = URI(start)
+      @uri = URI(start)
       @site = "#{uri.scheme}://#{uri.host}"
       @max_connections = max_connections
       @delay = delay
@@ -22,7 +22,7 @@ module CoolCrawler
       queue << uri.path
     end
 
-    attr_reader :max_connections, :delay, :callback
+    attr_reader :max_connections, :uri, :delay, :callback, :site
 
     def set_callback(proc)
       @callback=proc
@@ -41,17 +41,15 @@ module CoolCrawler
 
     def send_crawlers
       pages = []
-      until queue.empty? || pages.size >= max_connections
-        pages << queue.pop
-      end
+      pages << queue.pop until queue.empty? || pages.size >= max_connections
       Async do
         internet = Async::HTTP::Internet.new
         barrier = Async::Barrier.new
 
         pages.each do |page|
           barrier.async do
-            response = internet.get URI.join(@site, page)
-            links = Crawler.new(URI.join(@site, page), response.read).gather_links_uri
+            response = internet.get URI.join(@site, page).to_s
+            links = gather_links_uri(response.read, URI.join(uri, page))
             after(page, links)
             links.each do |link|
               enqueue(link)
@@ -65,6 +63,21 @@ module CoolCrawler
       end
     end
 
+    def gather_links_uri(body, page)
+      links = []
+      doc = Nokogiri::HTML(body)
+      doc.xpath("//a").each do |a|
+        next if a["href"].nil?
+        uri_a = URI(a["href"].strip.split('#')[0].sub(/\\|(\s+$)/, ""))
+        begin
+          links << URI.join(page, uri_a).path if (uri_a.host == uri.host || uri_a.host.nil?) && uri_a.path
+        rescue
+          # do nothing
+        end
+      end
+      links
+    end
+
     def queue
       @queue ||= Queue.new
     end
@@ -92,35 +105,5 @@ module CoolCrawler
     def enqueue(path)
       queue << path unless visited.include?(path)
     end
-
-  end
-
-
-  # This is the individual crawler
-  class Crawler
-    include CoolCrawler
-    def initialize(current, response)
-      @current = URI(current)
-      @response = response
-    end
-
-    attr_reader :current, :response
-
-    def gather_links_uri
-      links = []
-      doc = Nokogiri::HTML(response)
-      doc.xpath("//a").each do |a|
-        next if a["href"].nil?
-        uri_a = URI(a["href"].strip.split('#')[0].sub(/\\|(\s+$)/, ""))
-        begin
-          link = URI.join(current, uri_a).path if (uri_a.host == current.host || uri_a.host.nil?) && uri_a.path
-          links << URI.join(current, uri_a).path if (uri_a.host == current.host || uri_a.host.nil?) && uri_a.path
-        rescue
-          # do nothing
-        end
-      end
-      links
-    end
-
   end
 end
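The consolidated `gather_links_uri` above resolves each `href` against the current page and keeps only relative or same-host links as site paths. A standalone sketch of that approach (Nokogiri plus `URI.join`), using a hypothetical base page whose host stands in for the pool's start host:

```ruby
require 'uri'
require 'nokogiri'

# hypothetical base page standing in for the pool's start URI
page = URI("https://example.com/docs/")

html = <<~HTML
  <a href="/about">About</a>
  <a href="guide#intro">Guide</a>
  <a href="https://other.example.org/x">External</a>
HTML

links = []
Nokogiri::HTML(html).xpath("//a").each do |a|
  next if a["href"].nil?
  href = URI(a["href"].strip.split("#")[0])
  # keep relative or same-host links, resolved to a path on the site
  links << URI.join(page, href).path if (href.host.nil? || href.host == page.host) && href.path
rescue URI::Error
  # skip hrefs URI cannot parse (the gem rescues everything here)
end

p links # => ["/about", "/docs/guide"]
```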
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: coolCrawler
 version: !ruby/object:Gem::Version
-  version: 0.2.0
+  version: 0.3.0
 platform: ruby
 authors:
 - William Wright
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2022-10-03 00:00:00.000000000 Z
+date: 2022-10-05 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rspec