coolCrawler 0.1.0 → 0.2.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 5ce1b157940f577ce7346cc38f382804ef23475649d46e112c52b2695c1cc35d
4
- data.tar.gz: b29f27c7878b1360a1a2f60ce76f67bb751917dbd9b9870ea0922be22a7cb68d
3
+ metadata.gz: ce0e8b8ba2b209fbe9060b3d00744aaaf574c2b2ada771f1fe4f4bc63665bc4f
4
+ data.tar.gz: 89f98f2b07274e0d3f4217aa9303899d20a7395ebe3a133980f920087aaa48e4
5
5
  SHA512:
6
- metadata.gz: e03f00079f407af43f304aa604445a03da5e4bab441fdd5080dfb6531f69fe14781a81434a7acf64a135f5da8c03804f2aac2af3c0cea0fee0f1d25f9546eef1
7
- data.tar.gz: 637eb3cff98f1357ccd3ab5095f30c39a120b1348a020083e270ad11ca877e71fa232c1a7bd13f317f138ff7907ad137788812beb25e52e0d7cd4e318c6c6767
6
+ metadata.gz: 4612e0d29277b25c6e0621efc2b4cfaa0f2d5dbabd94445beef520a3b0d912c78942b73f80731d1394e5a5f9ef5aab51d274188f758bd099a680e2b45297be0f
7
+ data.tar.gz: 6c06f1eb22dfafbccc8dde7ce245c4d9ed7777ff575742c12a07c88cce1d3bde9b97efc18761bca1a4f2f6d86432cbf74fdbfb8f5ef9b23c69524129449a8236
data/Gemfile.lock CHANGED
@@ -1,104 +1,102 @@
1
- PATH
2
- remote: .
3
- specs:
4
- coolCrawler (0.1.0)
5
- mechanize
6
- nokogiri (~> 1.13)
7
- open-uri
8
- uri
9
-
10
- GEM
11
- remote: https://rubygems.org/
12
- specs:
13
- addressable (2.8.1)
14
- public_suffix (>= 2.0.2, < 6.0)
15
- ast (2.4.2)
16
- connection_pool (2.3.0)
17
- date (3.2.2)
18
- diff-lcs (1.5.0)
19
- domain_name (0.5.20190701)
20
- unf (>= 0.0.5, < 1.0.0)
21
- http-cookie (1.0.5)
22
- domain_name (~> 0.5)
23
- json (2.6.2)
24
- mechanize (2.8.5)
25
- addressable (~> 2.8)
26
- domain_name (~> 0.5, >= 0.5.20190701)
27
- http-cookie (~> 1.0, >= 1.0.3)
28
- mime-types (~> 3.0)
29
- net-http-digest_auth (~> 1.4, >= 1.4.1)
30
- net-http-persistent (>= 2.5.2, < 5.0.dev)
31
- nokogiri (~> 1.11, >= 1.11.2)
32
- rubyntlm (~> 0.6, >= 0.6.3)
33
- webrick (~> 1.7)
34
- webrobots (~> 0.1.2)
35
- mime-types (3.4.1)
36
- mime-types-data (~> 3.2015)
37
- mime-types-data (3.2022.0105)
38
- net-http-digest_auth (1.4.1)
39
- net-http-persistent (4.0.1)
40
- connection_pool (~> 2.2)
41
- nokogiri (1.13.8-x64-mingw-ucrt)
42
- racc (~> 1.4)
43
- open-uri (0.2.0)
44
- stringio
45
- time
46
- uri
47
- parallel (1.22.1)
48
- parser (3.1.2.1)
49
- ast (~> 2.4.1)
50
- public_suffix (5.0.0)
51
- racc (1.6.0)
52
- rainbow (3.1.1)
53
- rake (13.0.6)
54
- regexp_parser (2.5.0)
55
- rexml (3.2.5)
56
- rspec (3.11.0)
57
- rspec-core (~> 3.11.0)
58
- rspec-expectations (~> 3.11.0)
59
- rspec-mocks (~> 3.11.0)
60
- rspec-core (3.11.0)
61
- rspec-support (~> 3.11.0)
62
- rspec-expectations (3.11.1)
63
- diff-lcs (>= 1.2.0, < 2.0)
64
- rspec-support (~> 3.11.0)
65
- rspec-mocks (3.11.1)
66
- diff-lcs (>= 1.2.0, < 2.0)
67
- rspec-support (~> 3.11.0)
68
- rspec-support (3.11.1)
69
- rubocop (1.36.0)
70
- json (~> 2.3)
71
- parallel (~> 1.10)
72
- parser (>= 3.1.2.1)
73
- rainbow (>= 2.2.2, < 4.0)
74
- regexp_parser (>= 1.8, < 3.0)
75
- rexml (>= 3.2.5, < 4.0)
76
- rubocop-ast (>= 1.20.1, < 2.0)
77
- ruby-progressbar (~> 1.7)
78
- unicode-display_width (>= 1.4.0, < 3.0)
79
- rubocop-ast (1.21.0)
80
- parser (>= 3.1.1.0)
81
- ruby-progressbar (1.11.0)
82
- rubyntlm (0.6.3)
83
- stringio (3.0.2)
84
- time (0.2.0)
85
- date
86
- unf (0.1.4)
87
- unf_ext
88
- unf_ext (0.0.8.2-x64-mingw-ucrt)
89
- unicode-display_width (2.3.0)
90
- uri (0.11.0)
91
- webrick (1.7.0)
92
- webrobots (0.1.2)
93
-
94
- PLATFORMS
95
- x64-mingw-ucrt
96
-
97
- DEPENDENCIES
98
- coolCrawler!
99
- rake (~> 13.0)
100
- rspec (~> 3.0)
101
- rubocop (~> 1.21)
102
-
103
- BUNDLED WITH
104
- 2.3.7
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ coolCrawler (0.1.0)
5
+ async-http
6
+ nokogiri
7
+ open-uri
8
+ uri
9
+
10
+ GEM
11
+ remote: https://rubygems.org/
12
+ specs:
13
+ ast (2.4.2)
14
+ async (2.1.0)
15
+ console (~> 1.10)
16
+ io-event (~> 1.0.0)
17
+ timers (~> 4.1)
18
+ async-http (0.59.2)
19
+ async (>= 1.25)
20
+ async-io (>= 1.28)
21
+ async-pool (>= 0.2)
22
+ protocol-http (~> 0.23.1)
23
+ protocol-http1 (~> 0.14.0)
24
+ protocol-http2 (~> 0.14.0)
25
+ traces (>= 0.4.0)
26
+ async-io (1.34.0)
27
+ async
28
+ async-pool (0.3.12)
29
+ async (>= 1.25)
30
+ console (1.15.3)
31
+ fiber-local
32
+ date (3.2.2)
33
+ diff-lcs (1.5.0)
34
+ fiber-local (1.0.0)
35
+ io-event (1.0.9)
36
+ json (2.6.2)
37
+ nokogiri (1.13.8-x86_64-linux)
38
+ racc (~> 1.4)
39
+ open-uri (0.2.0)
40
+ stringio
41
+ time
42
+ uri
43
+ parallel (1.22.1)
44
+ parser (3.1.2.1)
45
+ ast (~> 2.4.1)
46
+ protocol-hpack (1.4.2)
47
+ protocol-http (0.23.12)
48
+ protocol-http1 (0.14.6)
49
+ protocol-http (~> 0.22)
50
+ protocol-http2 (0.14.2)
51
+ protocol-hpack (~> 1.4)
52
+ protocol-http (~> 0.18)
53
+ racc (1.6.0)
54
+ rainbow (3.1.1)
55
+ rake (13.0.6)
56
+ regexp_parser (2.6.0)
57
+ rexml (3.2.5)
58
+ rspec (3.11.0)
59
+ rspec-core (~> 3.11.0)
60
+ rspec-expectations (~> 3.11.0)
61
+ rspec-mocks (~> 3.11.0)
62
+ rspec-core (3.11.0)
63
+ rspec-support (~> 3.11.0)
64
+ rspec-expectations (3.11.1)
65
+ diff-lcs (>= 1.2.0, < 2.0)
66
+ rspec-support (~> 3.11.0)
67
+ rspec-mocks (3.11.1)
68
+ diff-lcs (>= 1.2.0, < 2.0)
69
+ rspec-support (~> 3.11.0)
70
+ rspec-support (3.11.1)
71
+ rubocop (1.36.0)
72
+ json (~> 2.3)
73
+ parallel (~> 1.10)
74
+ parser (>= 3.1.2.1)
75
+ rainbow (>= 2.2.2, < 4.0)
76
+ regexp_parser (>= 1.8, < 3.0)
77
+ rexml (>= 3.2.5, < 4.0)
78
+ rubocop-ast (>= 1.20.1, < 2.0)
79
+ ruby-progressbar (~> 1.7)
80
+ unicode-display_width (>= 1.4.0, < 3.0)
81
+ rubocop-ast (1.21.0)
82
+ parser (>= 3.1.1.0)
83
+ ruby-progressbar (1.11.0)
84
+ stringio (3.0.2)
85
+ time (0.2.0)
86
+ date
87
+ timers (4.3.4)
88
+ traces (0.7.0)
89
+ unicode-display_width (2.3.0)
90
+ uri (0.11.0)
91
+
92
+ PLATFORMS
93
+ x86_64-linux
94
+
95
+ DEPENDENCIES
96
+ coolCrawler!
97
+ rake (~> 13.0)
98
+ rspec (~> 3.0)
99
+ rubocop (~> 1.21)
100
+
101
+ BUNDLED WITH
102
+ 2.3.7
data/README.md CHANGED
@@ -1,8 +1,6 @@
1
1
  # CoolCrawler
2
2
 
3
- Welcome to your new gem! In this directory, you'll find the files you need to be able to package up your Ruby library into a gem. Put your Ruby code in the file `lib/coolCrawler`. To experiment with that code, run `bin/console` for an interactive prompt.
4
-
5
- TODO: Delete this and the text above, and describe your gem
3
+ Cool Crawler is a lightweight crawler built for that one university assignment.
6
4
 
7
5
  ## Installation
8
6
 
@@ -22,7 +20,31 @@ Or install it yourself as:
22
20
 
23
21
  ## Usage
24
22
 
25
- TODO: Write usage instructions here
23
+
24
+
25
+ ```ruby
26
+ # create a set of 10 crawlers with a delay of 0.01 seconds between each group of crawls
27
+ crawler = CoolCrawler::CrawlerServer.new("https://github.com", 10, 0.01)
28
+
29
+ # set the callback function. This will be called every time an individual crawler finishes crawling. page is the current path the crawler is on, links is an array
30
+ # of all links found
31
+ crawler.set_callback(Proc.new {|page, links| p page, links})
32
+
33
+ # starts the crawl (ends when there are no more pages in the queue)
34
+ crawler.run
35
+ end
36
+ ```
37
+
38
+ ## TO-DO
39
+
40
+ ### For version 0.1.x:
41
+
42
+ * Implement method to scan and apply the rules of robots.txt
43
+ * add a way to limit the number of links in the queue
44
+ * Sleep period can be specified in the start block, but it would be good to have it supplied as configuration
45
+ * Test and adapt for concurrency
46
+
47
+ I will implement PageRank in 0.2.x
26
48
 
27
49
  ## Development
28
50
 
Binary file
data/coolCrawler.gemspec CHANGED
@@ -31,7 +31,8 @@ Gem::Specification.new do |spec|
31
31
  # Uncomment to register a new dependency of your gem
32
32
  # spec.add_dependency "example-gem", "~> 1.0"
33
33
  spec.add_development_dependency "rspec", "~> 3.11"
34
- spec.add_dependency "nokogiri", "~> 1.13"
34
+ spec.add_dependency 'async-http'
35
+ spec.add_dependency "nokogiri"
35
36
  spec.add_dependency "open-uri"
36
37
  spec.add_dependency "uri"
37
38
  # For more information and examples about making a new gem, check out our
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module CoolCrawler
4
- VERSION = "0.1.0"
4
+ VERSION = "0.2.1"
5
5
  end
data/lib/cool_crawler.rb CHANGED
@@ -1,98 +1,126 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  require_relative "coolCrawler/version"
4
- require "Nokogiri"
5
- require "net/http"
6
- require "open-uri"
7
- require "uri"
4
+ require 'async'
5
+ require 'async/http/internet'
6
+ require 'async/barrier'
7
+ require 'nokogiri'
8
+
8
9
  # Module Controller
9
10
  module CoolCrawler
10
11
  class Error < StandardError; end
11
12
 
12
- # Main Crawler Class
13
- class Crawler
14
- include CoolCrawler
15
- def initialize(root)
16
- @uri = URI(root)
17
- @website = "#{@uri.scheme}://#{@uri.host}"
18
- @queue = []
19
- @visited = { @uri.path => 1 }
20
- @queue << @uri.path # root is part of the pages to crawl
13
+ # This is the class that handles the queue and async requests
14
+ class CrawlerServer
15
+
16
+ def initialize(start, max_connections, delay)
17
+ uri = URI(start)
18
+ @site = "#{uri.scheme}://#{uri.host}"
19
+ @max_connections = max_connections
20
+ @delay = delay
21
+ visited[uri.path] = 1
22
+ queue << uri.path
21
23
  end
22
24
 
23
- attr_reader :queue, :visted, :uri, :website
25
+ attr_reader :max_connections, :delay, :callback
24
26
 
25
- def http
26
- @http ||= Net::HTTP.new(@uri.host)
27
+ def set_callback(proc)
28
+ @callback=proc
27
29
  end
28
30
 
29
- def start
30
- return unless block_given?
31
+ def run
32
+ until queue.empty?
33
+ send_crawlers
34
+ sleep(delay)
35
+ end
36
+ end
31
37
 
32
- yield(self.next) until queue.empty?
38
+ def after(page, links)
39
+ callback.call(page, links) unless callback.nil?
33
40
  end
34
41
 
35
- def unvisited?(path)
36
- !@visited.include?(path)
42
+ def send_crawlers
43
+ pages = []
44
+ until queue.empty? || pages.size >= max_connections
45
+ pages << queue.pop
46
+ end
47
+ Async do
48
+ internet = Async::HTTP::Internet.new
49
+ barrier = Async::Barrier.new
50
+
51
+ pages.each do |page|
52
+ barrier.async do
53
+ response = internet.get URI.join(@site, page).to_s
54
+ links = Crawler.new(URI.join(@site, page), response.read).gather_links_uri
55
+ after(page, links)
56
+ links.each do |link|
57
+ enqueue(link)
58
+ add_to_visited(link)
59
+ end
60
+ end
61
+ end
62
+ barrier.wait
63
+ ensure
64
+ internet&.close
65
+ end
66
+ end
67
+
68
+ def queue
69
+ @queue ||= Queue.new
70
+ end
71
+
72
+ def visited
73
+ @visited ||= {}
37
74
  end
38
75
 
39
76
  def visited?(path)
40
- @visited.include?(path)
77
+ visited.include?(path)
41
78
  end
42
79
 
43
80
  def add_to_visited(path)
44
81
  if visited?(path)
45
- @visited[path] += 1
82
+ visited[path] += 1
46
83
  else
47
- @visited[path] = 1
84
+ visited[path] = 1
48
85
  end
49
86
  end
50
87
 
51
- def all_links
52
- @visited.sort_by { |_k, v| v }
88
+ def sorted_visited
89
+ visited.sort_by { |_k, v| v }
53
90
  end
54
91
 
55
- private
92
+ def enqueue(path)
93
+ queue << path unless visited.include?(path)
94
+ end
56
95
 
57
- def gather_links_uri(doc)
58
- doc.xpath("//a").each do |a|
59
- next if a["href"].nil?
96
+ end
60
97
 
61
- uri_a = URI(a["href"])
62
- yield URI.join(@current, uri_a).path if (uri_a.host == @uri.host || uri_a.host.nil?) && uri_a.path
63
- end
64
- end
65
98
 
66
- def get_body(uri)
67
- http.read_timeout = 5
68
- http.max_retries = 5
69
- res = http.get(uri.path)
70
- res.body
99
+ # This is the individual crawler
100
+ class Crawler
101
+ include CoolCrawler
102
+ def initialize(current, response)
103
+ @current = URI(current)
104
+ @response = response
71
105
  end
72
106
 
73
- def enqueue(path)
74
- queue << path unless @visited.include?(path)
75
- end
107
+ attr_reader :current, :response
76
108
 
77
- # Gathers the a['href'] links on a page and enqueues them
78
- def add_to_queue(path)
109
+ def gather_links_uri
79
110
  links = []
80
- @current = URI.join(@website, path)
81
- doc = Nokogiri::HTML(get_body(@current))
82
- gather_links_uri(doc) do |link_uri|
83
- enqueue(link_uri)
84
- add_to_visited(link_uri)
85
- links << link_uri
111
+ doc = Nokogiri::HTML(response)
112
+ doc.xpath("//a").each do |a|
113
+ next if a["href"].nil?
114
+ uri_a = URI(a["href"].strip.split('#')[0].sub(/\\|(\s+$)/, ""))
115
+ begin
116
+ link = URI.join(current, uri_a).path if (uri_a.host == current.host || uri_a.host.nil?) && uri_a.path
117
+ links << URI.join(current, uri_a).path if (uri_a.host == current.host || uri_a.host.nil?) && uri_a.path
118
+ rescue
119
+ # do nothing
120
+ end
86
121
  end
87
- [path, links]
88
- end
89
-
90
- def visit(path)
91
- add_to_queue(path)
92
- end
93
-
94
- def next
95
- visit(@queue.pop)
122
+ links
96
123
  end
124
+
97
125
  end
98
126
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: coolCrawler
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.2.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - William Wright
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2022-09-29 00:00:00.000000000 Z
11
+ date: 2022-10-04 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rspec
@@ -24,20 +24,34 @@ dependencies:
24
24
  - - "~>"
25
25
  - !ruby/object:Gem::Version
26
26
  version: '3.11'
27
+ - !ruby/object:Gem::Dependency
28
+ name: async-http
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
27
41
  - !ruby/object:Gem::Dependency
28
42
  name: nokogiri
29
43
  requirement: !ruby/object:Gem::Requirement
30
44
  requirements:
31
- - - "~>"
45
+ - - ">="
32
46
  - !ruby/object:Gem::Version
33
- version: '1.13'
47
+ version: '0'
34
48
  type: :runtime
35
49
  prerelease: false
36
50
  version_requirements: !ruby/object:Gem::Requirement
37
51
  requirements:
38
- - - "~>"
52
+ - - ">="
39
53
  - !ruby/object:Gem::Version
40
- version: '1.13'
54
+ version: '0'
41
55
  - !ruby/object:Gem::Dependency
42
56
  name: open-uri
43
57
  requirement: !ruby/object:Gem::Requirement
@@ -82,6 +96,7 @@ files:
82
96
  - LICENSE.txt
83
97
  - README.md
84
98
  - Rakefile
99
+ - coolCrawler-0.2.0.gem
85
100
  - coolCrawler.gemspec
86
101
  - lib/coolCrawler/version.rb
87
102
  - lib/cool_crawler.rb