vessel 0.1.1 → 0.2.0

This diff shows the changes between two publicly available versions of the package, each released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 44fb472d4afaf916edc97894dcc39cf8b6bfbf3f8f1f0b2e8a47f495482b1bd9
4
- data.tar.gz: 36af4cd9021bd410bf1988c01f97d98df4e5646f5a56416004869fe643403672
3
+ metadata.gz: 291c528925e5f6990b8223762c00a996cf8435325b52048c1c0338fc873d1a42
4
+ data.tar.gz: 5c845e2c6b19adea7742bbad3a031fe0ad5f09a3a0f8ceff248970cc2bb74fcb
5
5
  SHA512:
6
- metadata.gz: bda3863083cdce0e8011675a0e83a583d626e81ab713803a54c5056f922d4822b069dacd8d4e5f0079d4f8625a172f7f9d30d4e3586439137af088ac0911201e
7
- data.tar.gz: 205b2f54fa17283daf50d0fdaa96e67f5dec4bed2c69ccc740433c90ecefaa9c4b1e13740cb703ac56cfe8c92b2df9da436fee94fc7937242465a33e91a088f5
6
+ metadata.gz: 77d65c60e1541c1ab1473b64bd7c451e3d57199fa19885bcf4e182d295321d0cdd36121cc673a9285a7113b482ec9143607d17a4017632d1d6826b5c5ad48b7e
7
+ data.tar.gz: 2a0e9bbe6de79e9ba6e7afed4d422de6b72639d91b508a921dd4bba2cf0b8e878a3de6846fca87de25e20047594b153efcf70ca3b91d373190bf976605e0bb2d
data/README.md CHANGED
@@ -97,23 +97,5 @@ To be continued
97
97
 
98
98
  ## License
99
99
 
100
- Copyright 2018-2020 Machinio
101
-
102
- Permission is hereby granted, free of charge, to any person obtaining
103
- a copy of this software and associated documentation files (the
104
- "Software"), to deal in the Software without restriction, including
105
- without limitation the rights to use, copy, modify, merge, publish,
106
- distribute, sublicense, and/or sell copies of the Software, and to
107
- permit persons to whom the Software is furnished to do so, subject to
108
- the following conditions:
109
-
110
- The above copyright notice and this permission notice shall be
111
- included in all copies or substantial portions of the Software.
112
-
113
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
114
- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
115
- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
116
- NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
117
- LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
118
- OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
119
- WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
100
+ The gem is available as open source under the terms of the
101
+ [MIT License](https://opensource.org/licenses/MIT).
data/lib/vessel/cargo.rb CHANGED
@@ -24,7 +24,7 @@ module Vessel
24
24
  end
25
25
 
26
26
  def start_urls(*urls)
27
- settings[:start_urls] = urls
27
+ settings[:start_urls] = urls.flatten
28
28
  end
29
29
 
30
30
  def delay(value)
@@ -35,6 +35,10 @@ module Vessel
35
35
  settings[:timeout] = value
36
36
  end
37
37
 
38
+ def headers(value)
39
+ settings[:headers] = value
40
+ end
41
+
38
42
  def threads(min: MIN_THREADS, max: MAX_THREADS)
39
43
  settings[:min_threads] = min
40
44
  settings[:max_threads] = max
@@ -44,6 +48,14 @@ module Vessel
44
48
  settings[:middleware] = classes
45
49
  end
46
50
 
51
+ def ferrum(**options)
52
+ settings[:ferrum] = options
53
+ end
54
+
55
+ def intercept(&block)
56
+ settings[:intercept] = block
57
+ end
58
+
47
59
  def settings
48
60
  @settings ||= {
49
61
  delay: DELAY,
@@ -51,6 +63,9 @@ module Vessel
51
63
  start_urls: START_URLS,
52
64
  min_threads: MIN_THREADS,
53
65
  max_threads: MAX_THREADS,
66
+ ferrum: Hash.new,
67
+ intercept: nil,
68
+ headers: nil,
54
69
  domain: name&.split('::')&.last&.downcase
55
70
  }
56
71
  end
@@ -82,5 +97,9 @@ module Vessel
82
97
  def absolute_url(relative)
83
98
  Addressable::URI.join(page.current_url, relative).to_s
84
99
  end
100
+
101
+ def current_url
102
+ Addressable::URI.parse(page.current_url)
103
+ end
85
104
  end
86
105
  end
data/lib/vessel/engine.rb CHANGED
@@ -21,25 +21,35 @@ module Vessel
21
21
 
22
22
  until @queue.closed?
23
23
  message = @queue.pop
24
+
24
25
  raise(message) if message.is_a?(Exception)
25
- handle(*message)
26
+
27
+ page, request = message
28
+ args = [request.method, request.data].compact
29
+ handle(page, args)
30
+
26
31
  @queue.close if idle?
27
32
  end
33
+
34
+ ensure
35
+ scheduler.stop
28
36
  end
29
37
 
30
- def handle(page, request)
38
+ def handle(page, args)
31
39
  crawler = @crawler_class.new(page)
32
- crawler.send(request.method) do |*args|
33
- if args.all? { |i| i.is_a?(Request) }
34
- scheduler.post(*args)
40
+ crawler.send(*args) do |*result|
41
+ if result.flatten.all? { |i| i.is_a?(Request) }
42
+ scheduler.post(*result.flatten)
35
43
  else
36
- @middleware&.call(*args)
44
+ @middleware&.call(*result)
37
45
  end
38
46
  end
39
47
  ensure
40
- page.close
48
+ page.close if page
41
49
  end
42
50
 
51
+ private
52
+
43
53
  def start_requests
44
54
  Request.build(*settings[:start_urls])
45
55
  end
data/lib/vessel/request.rb CHANGED
@@ -4,16 +4,24 @@ require "addressable/uri"
4
4
 
5
5
  module Vessel
6
6
  class Request
7
- attr_reader :url, :uri, :method
7
+ attr_reader :url, :uri, :method, :data
8
8
 
9
9
  def self.build(*urls)
10
- urls.map { |url| new(url: url) }
10
+ urls.empty? ? [new] : urls.map { |url| new(url: url) }
11
11
  end
12
12
 
13
- def initialize(url:, method: :parse)
14
- @url = url.to_s
15
- @uri = Addressable::URI.parse(@url)
13
+ def initialize(url: nil, method: :parse, data: nil)
14
+ if url
15
+ @url = url.to_s
16
+ @uri = Addressable::URI.parse(@url)
17
+ end
18
+
16
19
  @method = method
20
+ @data = data.freeze if data
21
+ end
22
+
23
+ def stub?
24
+ !url
17
25
  end
18
26
  end
19
27
  end
data/lib/vessel/scheduler.rb CHANGED
@@ -6,18 +6,23 @@ require "concurrent-ruby"
6
6
  module Vessel
7
7
  class Scheduler
8
8
  extend Forwardable
9
- delegate %i[scheduled_task_count completed_task_count queue_length] => :@pool
9
+ delegate %i[scheduled_task_count completed_task_count queue_length] => :pool
10
10
 
11
- attr_reader :browser, :queue, :delay
11
+ attr_reader :browser, :queue, :delay, :headers
12
12
 
13
13
  def initialize(queue, settings)
14
14
  @queue = queue
15
- @min_threads, @max_threads, @delay =
16
- settings.values_at(:min_threads, :max_threads, :delay)
15
+ @min_threads, @max_threads, @delay, @headers =
16
+ settings.values_at(:min_threads, :max_threads, :delay, :headers)
17
17
 
18
- options = {}
18
+ options = settings[:ferrum]
19
19
  options.merge!(timeout: settings[:timeout]) if settings[:timeout]
20
20
  @browser = Ferrum::Browser.new(**options)
21
+
22
+ if settings[:intercept]
23
+ @browser.network.intercept
24
+ @browser.on(:request, &settings[:intercept])
25
+ end
21
26
  end
22
27
 
23
28
  def post(*requests)
@@ -28,6 +33,12 @@ module Vessel
28
33
  end
29
34
  end
30
35
 
36
+ def stop
37
+ pool.shutdown
38
+ pool.kill unless pool.wait_for_termination(30)
39
+ browser.quit
40
+ end
41
+
31
42
  private
32
43
 
33
44
  def pool
@@ -39,7 +50,10 @@ module Vessel
39
50
  end
40
51
 
41
52
  def goto(request)
53
+ return [nil, request] if request.stub?
54
+
42
55
  page = browser.create_page
56
+ page.headers.set(headers) if headers
43
57
  # Delay is set between requests when we don't want to bombard server with
44
58
  # requests so it requires crawler to be single threaded. Otherwise doesn't
45
59
  # make sense.
data/lib/vessel/version.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Vessel
4
- VERSION = "0.1.1"
4
+ VERSION = "0.2.0"
5
5
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: vessel
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.1
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Dmitry Vorotilin
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2020-04-06 00:00:00.000000000 Z
11
+ date: 2021-03-09 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: ferrum
@@ -16,28 +16,28 @@ dependencies:
16
16
  requirements:
17
17
  - - ">="
18
18
  - !ruby/object:Gem::Version
19
- version: '0.4'
19
+ version: '0.8'
20
20
  type: :runtime
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
24
  - - ">="
25
25
  - !ruby/object:Gem::Version
26
- version: '0.4'
26
+ version: '0.8'
27
27
  - !ruby/object:Gem::Dependency
28
28
  name: thor
29
29
  requirement: !ruby/object:Gem::Requirement
30
30
  requirements:
31
31
  - - "~>"
32
32
  - !ruby/object:Gem::Version
33
- version: '0.20'
33
+ version: '1.0'
34
34
  type: :runtime
35
35
  prerelease: false
36
36
  version_requirements: !ruby/object:Gem::Requirement
37
37
  requirements:
38
38
  - - "~>"
39
39
  - !ruby/object:Gem::Version
40
- version: '0.20'
40
+ version: '1.0'
41
41
  - !ruby/object:Gem::Dependency
42
42
  name: bundler
43
43
  requirement: !ruby/object:Gem::Requirement
@@ -117,7 +117,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
117
117
  - !ruby/object:Gem::Version
118
118
  version: '0'
119
119
  requirements: []
120
- rubygems_version: 3.0.3
120
+ rubygems_version: 3.1.4
121
121
  signing_key:
122
122
  specification_version: 4
123
123
  summary: High-level web crawling framework