vessel 0.1.1 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 44fb472d4afaf916edc97894dcc39cf8b6bfbf3f8f1f0b2e8a47f495482b1bd9
4
- data.tar.gz: 36af4cd9021bd410bf1988c01f97d98df4e5646f5a56416004869fe643403672
3
+ metadata.gz: 291c528925e5f6990b8223762c00a996cf8435325b52048c1c0338fc873d1a42
4
+ data.tar.gz: 5c845e2c6b19adea7742bbad3a031fe0ad5f09a3a0f8ceff248970cc2bb74fcb
5
5
  SHA512:
6
- metadata.gz: bda3863083cdce0e8011675a0e83a583d626e81ab713803a54c5056f922d4822b069dacd8d4e5f0079d4f8625a172f7f9d30d4e3586439137af088ac0911201e
7
- data.tar.gz: 205b2f54fa17283daf50d0fdaa96e67f5dec4bed2c69ccc740433c90ecefaa9c4b1e13740cb703ac56cfe8c92b2df9da436fee94fc7937242465a33e91a088f5
6
+ metadata.gz: 77d65c60e1541c1ab1473b64bd7c451e3d57199fa19885bcf4e182d295321d0cdd36121cc673a9285a7113b482ec9143607d17a4017632d1d6826b5c5ad48b7e
7
+ data.tar.gz: 2a0e9bbe6de79e9ba6e7afed4d422de6b72639d91b508a921dd4bba2cf0b8e878a3de6846fca87de25e20047594b153efcf70ca3b91d373190bf976605e0bb2d
data/README.md CHANGED
@@ -97,23 +97,5 @@ To be continued
97
97
 
98
98
  ## License
99
99
 
100
- Copyright 2018-2020 Machinio
101
-
102
- Permission is hereby granted, free of charge, to any person obtaining
103
- a copy of this software and associated documentation files (the
104
- "Software"), to deal in the Software without restriction, including
105
- without limitation the rights to use, copy, modify, merge, publish,
106
- distribute, sublicense, and/or sell copies of the Software, and to
107
- permit persons to whom the Software is furnished to do so, subject to
108
- the following conditions:
109
-
110
- The above copyright notice and this permission notice shall be
111
- included in all copies or substantial portions of the Software.
112
-
113
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
114
- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
115
- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
116
- NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
117
- LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
118
- OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
119
- WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
100
+ The gem is available as open source under the terms of the
101
+ [MIT License](https://opensource.org/licenses/MIT).
data/lib/vessel/cargo.rb CHANGED
@@ -24,7 +24,7 @@ module Vessel
24
24
  end
25
25
 
26
26
  def start_urls(*urls)
27
- settings[:start_urls] = urls
27
+ settings[:start_urls] = urls.flatten
28
28
  end
29
29
 
30
30
  def delay(value)
@@ -35,6 +35,10 @@ module Vessel
35
35
  settings[:timeout] = value
36
36
  end
37
37
 
38
+ def headers(value)
39
+ settings[:headers] = value
40
+ end
41
+
38
42
  def threads(min: MIN_THREADS, max: MAX_THREADS)
39
43
  settings[:min_threads] = min
40
44
  settings[:max_threads] = max
@@ -44,6 +48,14 @@ module Vessel
44
48
  settings[:middleware] = classes
45
49
  end
46
50
 
51
+ def ferrum(**options)
52
+ settings[:ferrum] = options
53
+ end
54
+
55
+ def intercept(&block)
56
+ settings[:intercept] = block
57
+ end
58
+
47
59
  def settings
48
60
  @settings ||= {
49
61
  delay: DELAY,
@@ -51,6 +63,9 @@ module Vessel
51
63
  start_urls: START_URLS,
52
64
  min_threads: MIN_THREADS,
53
65
  max_threads: MAX_THREADS,
66
+ ferrum: Hash.new,
67
+ intercept: nil,
68
+ headers: nil,
54
69
  domain: name&.split('::')&.last&.downcase
55
70
  }
56
71
  end
@@ -82,5 +97,9 @@ module Vessel
82
97
  def absolute_url(relative)
83
98
  Addressable::URI.join(page.current_url, relative).to_s
84
99
  end
100
+
101
+ def current_url
102
+ Addressable::URI.parse(page.current_url)
103
+ end
85
104
  end
86
105
  end
data/lib/vessel/engine.rb CHANGED
@@ -21,25 +21,35 @@ module Vessel
21
21
 
22
22
  until @queue.closed?
23
23
  message = @queue.pop
24
+
24
25
  raise(message) if message.is_a?(Exception)
25
- handle(*message)
26
+
27
+ page, request = message
28
+ args = [request.method, request.data].compact
29
+ handle(page, args)
30
+
26
31
  @queue.close if idle?
27
32
  end
33
+
34
+ ensure
35
+ scheduler.stop
28
36
  end
29
37
 
30
- def handle(page, request)
38
+ def handle(page, args)
31
39
  crawler = @crawler_class.new(page)
32
- crawler.send(request.method) do |*args|
33
- if args.all? { |i| i.is_a?(Request) }
34
- scheduler.post(*args)
40
+ crawler.send(*args) do |*result|
41
+ if result.flatten.all? { |i| i.is_a?(Request) }
42
+ scheduler.post(*result.flatten)
35
43
  else
36
- @middleware&.call(*args)
44
+ @middleware&.call(*result)
37
45
  end
38
46
  end
39
47
  ensure
40
- page.close
48
+ page.close if page
41
49
  end
42
50
 
51
+ private
52
+
43
53
  def start_requests
44
54
  Request.build(*settings[:start_urls])
45
55
  end
@@ -4,16 +4,24 @@ require "addressable/uri"
4
4
 
5
5
  module Vessel
6
6
  class Request
7
- attr_reader :url, :uri, :method
7
+ attr_reader :url, :uri, :method, :data
8
8
 
9
9
  def self.build(*urls)
10
- urls.map { |url| new(url: url) }
10
+ urls.empty? ? [new] : urls.map { |url| new(url: url) }
11
11
  end
12
12
 
13
- def initialize(url:, method: :parse)
14
- @url = url.to_s
15
- @uri = Addressable::URI.parse(@url)
13
+ def initialize(url: nil, method: :parse, data: nil)
14
+ if url
15
+ @url = url.to_s
16
+ @uri = Addressable::URI.parse(@url)
17
+ end
18
+
16
19
  @method = method
20
+ @data = data.freeze if data
21
+ end
22
+
23
+ def stub?
24
+ !url
17
25
  end
18
26
  end
19
27
  end
@@ -6,18 +6,23 @@ require "concurrent-ruby"
6
6
  module Vessel
7
7
  class Scheduler
8
8
  extend Forwardable
9
- delegate %i[scheduled_task_count completed_task_count queue_length] => :@pool
9
+ delegate %i[scheduled_task_count completed_task_count queue_length] => :pool
10
10
 
11
- attr_reader :browser, :queue, :delay
11
+ attr_reader :browser, :queue, :delay, :headers
12
12
 
13
13
  def initialize(queue, settings)
14
14
  @queue = queue
15
- @min_threads, @max_threads, @delay =
16
- settings.values_at(:min_threads, :max_threads, :delay)
15
+ @min_threads, @max_threads, @delay, @headers =
16
+ settings.values_at(:min_threads, :max_threads, :delay, :headers)
17
17
 
18
- options = {}
18
+ options = settings[:ferrum]
19
19
  options.merge!(timeout: settings[:timeout]) if settings[:timeout]
20
20
  @browser = Ferrum::Browser.new(**options)
21
+
22
+ if settings[:intercept]
23
+ @browser.network.intercept
24
+ @browser.on(:request, &settings[:intercept])
25
+ end
21
26
  end
22
27
 
23
28
  def post(*requests)
@@ -28,6 +33,12 @@ module Vessel
28
33
  end
29
34
  end
30
35
 
36
+ def stop
37
+ pool.shutdown
38
+ pool.kill unless pool.wait_for_termination(30)
39
+ browser.quit
40
+ end
41
+
31
42
  private
32
43
 
33
44
  def pool
@@ -39,7 +50,10 @@ module Vessel
39
50
  end
40
51
 
41
52
  def goto(request)
53
+ return [nil, request] if request.stub?
54
+
42
55
  page = browser.create_page
56
+ page.headers.set(headers) if headers
43
57
  # Delay is set between requests when we don't want to bombard server with
44
58
  # requests so it requires crawler to be single threaded. Otherwise doesn't
45
59
  # make sense.
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Vessel
4
- VERSION = "0.1.1"
4
+ VERSION = "0.2.0"
5
5
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: vessel
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.1
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Dmitry Vorotilin
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2020-04-06 00:00:00.000000000 Z
11
+ date: 2021-03-09 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: ferrum
@@ -16,28 +16,28 @@ dependencies:
16
16
  requirements:
17
17
  - - ">="
18
18
  - !ruby/object:Gem::Version
19
- version: '0.4'
19
+ version: '0.8'
20
20
  type: :runtime
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
24
  - - ">="
25
25
  - !ruby/object:Gem::Version
26
- version: '0.4'
26
+ version: '0.8'
27
27
  - !ruby/object:Gem::Dependency
28
28
  name: thor
29
29
  requirement: !ruby/object:Gem::Requirement
30
30
  requirements:
31
31
  - - "~>"
32
32
  - !ruby/object:Gem::Version
33
- version: '0.20'
33
+ version: '1.0'
34
34
  type: :runtime
35
35
  prerelease: false
36
36
  version_requirements: !ruby/object:Gem::Requirement
37
37
  requirements:
38
38
  - - "~>"
39
39
  - !ruby/object:Gem::Version
40
- version: '0.20'
40
+ version: '1.0'
41
41
  - !ruby/object:Gem::Dependency
42
42
  name: bundler
43
43
  requirement: !ruby/object:Gem::Requirement
@@ -117,7 +117,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
117
117
  - !ruby/object:Gem::Version
118
118
  version: '0'
119
119
  requirements: []
120
- rubygems_version: 3.0.3
120
+ rubygems_version: 3.1.4
121
121
  signing_key:
122
122
  specification_version: 4
123
123
  summary: High-level web crawling framework