coolCrawler 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: 2158b8a1d5ceeaeb6d0d7813e6ae5020a49eaffd620453c8490e3e64749fecbb
- data.tar.gz: 76fcf09ba3252a94a748b6a16ed43787b7fd28f0806aff430a58c2aa884d6266
+ metadata.gz: e927853d47dec36f11557336b9a7c1f2d986b1a9933b78878f2cef18f585cf54
+ data.tar.gz: 67a189af1c21a32650b4cd58573aac781cf2f2a43ce6b20a88eb04a3298fbb79
  SHA512:
- metadata.gz: 3a51a6c79a067d65e2efb064b0e7aedc9740d5137143cef98d5007d4dfff9a8cd4fde144e8ebca36becf418f01039dfebe037ee1e9f0f658afa624c4ae9f8336
- data.tar.gz: 3bd9259785c29120037066739c7c55b4b6b011ac7aa252b9f27a82516c8340e90825dd7d868f54e9c49cd7b41ed4804892b1494316adb9bfd3629fa34cbbfffc
+ metadata.gz: 6f3e9b7ff0b17807160670456b7d1bf49079760accef770620e0689e66234f6737dd90c973178f1bc600d22c2bc114e95d198fd756a656f006c2d9f35e2b7167
+ data.tar.gz: 2e03b27c2142c7eb389df2e4955070924c843e92d2a167e6aa1b5411e2698bcfbc551999542575b46eda561a949eaca2a3445a1e7db2ba6304700b58ed8a35b7
data/CHANGELOG.md CHANGED
@@ -1,4 +1,9 @@
- ## [Unreleased]
+ ## [0.3.0] - 2022-09-24
+
+ - Removed Crawler class
+ - CrawlerServer has been renamed to CrawlerPool
+ - added attr_reader to get site.
+ - Crawler still ignores outgoing links for now

  ## [0.1.0] - 2022-09-24

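The headline change in 0.3.0 is the removal of the standalone `Crawler` class and the rename of `CrawlerServer` to `CrawlerPool`. A minimal migration sketch, assuming the `(start_url, max_connections, delay)` constructor arguments shown in the README below; the URL is a placeholder:

```ruby
# 0.2.0: the pool class was called CrawlerServer
crawler = CoolCrawler::CrawlerServer.new("https://example.com", 10, 0.01)

# 0.3.0: same arguments, new class name
crawler = CoolCrawler::CrawlerPool.new("https://example.com", 10, 0.01)
```

Note that the README hunk below still instantiates `CrawlerServer`, so the usage example shipped in this release does not yet reflect the rename.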
data/README.md CHANGED
@@ -23,6 +23,7 @@ Or install it yourself as:


  ```ruby
+ require 'cool_crawler'
  # create a set of 10 crawlers with a delay of 0.01 seconds between each group of crawl
  crawler = CoolCrawler::CrawlerServer.new("https://github.com", 10, 0.01)

data/lib/cool_crawler/version.rb CHANGED
@@ -1,5 +1,5 @@
  # frozen_string_literal: true

  module CoolCrawler
- VERSION = "0.2.0"
+ VERSION = "0.3.0"
  end
data/lib/cool_crawler.rb CHANGED
@@ -11,10 +11,10 @@ module CoolCrawler
  class Error < StandardError; end

  # This is the class that handles the queue and async requests
- class CrawlerServer
+ class CrawlerPool

  def initialize(start, max_connections, delay)
- uri = URI(start)
+ @uri = URI(start)
  @site = "#{uri.scheme}://#{uri.host}"
  @max_connections = max_connections
  @delay = delay
@@ -22,7 +22,7 @@ module CoolCrawler
  queue << uri.path
  end

- attr_reader :max_connections, :delay, :callback
+ attr_reader :max_connections, :uri, :delay, :callback, :site

  def set_callback(proc)
  @callback=proc
@@ -41,17 +41,15 @@ module CoolCrawler

  def send_crawlers
  pages = []
- until queue.empty? || pages.size >= max_connections
- pages << queue.pop
- end
+ pages << queue.pop until queue.empty? || pages.size >= max_connections
  Async do
  internet = Async::HTTP::Internet.new
  barrier = Async::Barrier.new

  pages.each do |page|
  barrier.async do
- response = internet.get URI.join(@site, page)
- links = Crawler.new(URI.join(@site, page), response.read).gather_links_uri
+ response = internet.get URI.join(@site, page).to_s
+ links = gather_links_uri(response.read, URI.join(uri, page))
  after(page, links)
  links.each do |link|
  enqueue(link)
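The hunk above keeps the gem's concurrency pattern intact: pop up to `max_connections` paths off the queue, then fetch them concurrently, one `Async` task per page grouped under a barrier. The added `.to_s` converts the `URI` object returned by `URI.join` into a String before it is handed to `Async::HTTP::Internet#get`. A standalone sketch of the same pattern, assuming the async and async-http gems; the site and paths are placeholders:

```ruby
require "uri"
require "async"
require "async/barrier"
require "async/http/internet"

site  = "https://example.com" # placeholder
pages = ["/", "/about"]       # stand-in for the paths popped off the queue

Async do
  internet = Async::HTTP::Internet.new
  barrier = Async::Barrier.new

  # One concurrent task per page, as in CrawlerPool#send_crawlers.
  pages.each do |page|
    barrier.async do
      response = internet.get(URI.join(site, page).to_s)
      puts "#{page}: #{response.read.bytesize} bytes"
    end
  end

  barrier.wait
ensure
  internet&.close
end
```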
@@ -65,6 +63,21 @@ module CoolCrawler
  end
  end

+ def gather_links_uri(body, page)
+ links = []
+ doc = Nokogiri::HTML(body)
+ doc.xpath("//a").each do |a|
+ next if a["href"].nil?
+ uri_a = URI(a["href"].strip.split('#')[0].sub(/\\|(\s+$)/, ""))
+ begin
+ links << URI.join(page, uri_a).path if (uri_a.host == uri.host || uri_a.host.nil?) && uri_a.path
+ rescue
+ # do nothing
+ end
+ end
+ links
+ end
+
  def queue
  @queue ||= Queue.new
  end
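`gather_links_uri`, now an instance method on `CrawlerPool`, is also where outgoing links are still ignored (as the changelog notes): an anchor is kept only if its host matches the crawl site's host or is absent, i.e. the link is relative. A slightly simplified standalone sketch of that filter, assuming the nokogiri gem; the page URL and HTML are made up, and the gem's extra scrubbing of stray backslashes in hrefs is omitted:

```ruby
require "uri"
require "nokogiri"

page = URI("https://example.com/docs/index.html") # placeholder current page
body = <<~HTML
  <a href="/about">absolute path, same host: kept</a>
  <a href="guide.html#intro">relative, fragment stripped: kept</a>
  <a href="https://other.org/x">external host: dropped</a>
HTML

links = []
Nokogiri::HTML(body).xpath("//a").each do |a|
  next if a["href"].nil?
  href = URI(a["href"].strip.split("#")[0])
  # Keep same-host or relative links, resolved against the current page.
  links << URI.join(page, href).path if (href.host == page.host || href.host.nil?) && href.path
rescue URI::Error
  # Malformed href: skip, mirroring the gem's bare rescue.
end

links # => ["/about", "/docs/guide.html"]
```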
@@ -92,35 +105,5 @@ module CoolCrawler
  def enqueue(path)
  queue << path unless visited.include?(path)
  end
-
- end
-
-
- # This is the individual crawler
- class Crawler
- include CoolCrawler
- def initialize(current, response)
- @current = URI(current)
- @response = response
- end
-
- attr_reader :current, :response
-
- def gather_links_uri
- links = []
- doc = Nokogiri::HTML(response)
- doc.xpath("//a").each do |a|
- next if a["href"].nil?
- uri_a = URI(a["href"].strip.split('#')[0].sub(/\\|(\s+$)/, ""))
- begin
- link = URI.join(current, uri_a).path if (uri_a.host == current.host || uri_a.host.nil?) && uri_a.path
- links << URI.join(current, uri_a).path if (uri_a.host == current.host || uri_a.host.nil?) && uri_a.path
- rescue
- # do nothing
- end
- end
- links
- end
-
  end
  end
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: coolCrawler
  version: !ruby/object:Gem::Version
- version: 0.2.0
+ version: 0.3.0
  platform: ruby
  authors:
  - William Wright
  autorequire:
  bindir: exe
  cert_chain: []
- date: 2022-10-03 00:00:00.000000000 Z
+ date: 2022-10-05 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
  name: rspec