vessel 0.1.1 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +2 -20
- data/lib/vessel/cargo.rb +20 -1
- data/lib/vessel/engine.rb +17 -7
- data/lib/vessel/request.rb +13 -5
- data/lib/vessel/scheduler.rb +19 -5
- data/lib/vessel/version.rb +1 -1
- metadata +7 -7
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 291c528925e5f6990b8223762c00a996cf8435325b52048c1c0338fc873d1a42
|
4
|
+
data.tar.gz: 5c845e2c6b19adea7742bbad3a031fe0ad5f09a3a0f8ceff248970cc2bb74fcb
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 77d65c60e1541c1ab1473b64bd7c451e3d57199fa19885bcf4e182d295321d0cdd36121cc673a9285a7113b482ec9143607d17a4017632d1d6826b5c5ad48b7e
|
7
|
+
data.tar.gz: 2a0e9bbe6de79e9ba6e7afed4d422de6b72639d91b508a921dd4bba2cf0b8e878a3de6846fca87de25e20047594b153efcf70ca3b91d373190bf976605e0bb2d
|
data/README.md
CHANGED
@@ -97,23 +97,5 @@ To be continued
|
|
97
97
|
|
98
98
|
## License
|
99
99
|
|
100
|
-
|
101
|
-
|
102
|
-
Permission is hereby granted, free of charge, to any person obtaining
|
103
|
-
a copy of this software and associated documentation files (the
|
104
|
-
"Software"), to deal in the Software without restriction, including
|
105
|
-
without limitation the rights to use, copy, modify, merge, publish,
|
106
|
-
distribute, sublicense, and/or sell copies of the Software, and to
|
107
|
-
permit persons to whom the Software is furnished to do so, subject to
|
108
|
-
the following conditions:
|
109
|
-
|
110
|
-
The above copyright notice and this permission notice shall be
|
111
|
-
included in all copies or substantial portions of the Software.
|
112
|
-
|
113
|
-
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
114
|
-
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
115
|
-
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
116
|
-
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
117
|
-
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
118
|
-
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
119
|
-
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
100
|
+
The gem is available as open source under the terms of the
|
101
|
+
[MIT License](https://opensource.org/licenses/MIT).
|
data/lib/vessel/cargo.rb
CHANGED
@@ -24,7 +24,7 @@ module Vessel
|
|
24
24
|
end
|
25
25
|
|
26
26
|
def start_urls(*urls)
|
27
|
-
settings[:start_urls] = urls
|
27
|
+
settings[:start_urls] = urls.flatten
|
28
28
|
end
|
29
29
|
|
30
30
|
def delay(value)
|
@@ -35,6 +35,10 @@ module Vessel
|
|
35
35
|
settings[:timeout] = value
|
36
36
|
end
|
37
37
|
|
38
|
+
def headers(value)
|
39
|
+
settings[:headers] = value
|
40
|
+
end
|
41
|
+
|
38
42
|
def threads(min: MIN_THREADS, max: MAX_THREADS)
|
39
43
|
settings[:min_threads] = min
|
40
44
|
settings[:max_threads] = max
|
@@ -44,6 +48,14 @@ module Vessel
|
|
44
48
|
settings[:middleware] = classes
|
45
49
|
end
|
46
50
|
|
51
|
+
def ferrum(**options)
|
52
|
+
settings[:ferrum] = options
|
53
|
+
end
|
54
|
+
|
55
|
+
def intercept(&block)
|
56
|
+
settings[:intercept] = block
|
57
|
+
end
|
58
|
+
|
47
59
|
def settings
|
48
60
|
@settings ||= {
|
49
61
|
delay: DELAY,
|
@@ -51,6 +63,9 @@ module Vessel
|
|
51
63
|
start_urls: START_URLS,
|
52
64
|
min_threads: MIN_THREADS,
|
53
65
|
max_threads: MAX_THREADS,
|
66
|
+
ferrum: Hash.new,
|
67
|
+
intercept: nil,
|
68
|
+
headers: nil,
|
54
69
|
domain: name&.split('::')&.last&.downcase
|
55
70
|
}
|
56
71
|
end
|
@@ -82,5 +97,9 @@ module Vessel
|
|
82
97
|
def absolute_url(relative)
|
83
98
|
Addressable::URI.join(page.current_url, relative).to_s
|
84
99
|
end
|
100
|
+
|
101
|
+
def current_url
|
102
|
+
Addressable::URI.parse(page.current_url)
|
103
|
+
end
|
85
104
|
end
|
86
105
|
end
|
data/lib/vessel/engine.rb
CHANGED
@@ -21,25 +21,35 @@ module Vessel
|
|
21
21
|
|
22
22
|
until @queue.closed?
|
23
23
|
message = @queue.pop
|
24
|
+
|
24
25
|
raise(message) if message.is_a?(Exception)
|
25
|
-
|
26
|
+
|
27
|
+
page, request = message
|
28
|
+
args = [request.method, request.data].compact
|
29
|
+
handle(page, args)
|
30
|
+
|
26
31
|
@queue.close if idle?
|
27
32
|
end
|
33
|
+
|
34
|
+
ensure
|
35
|
+
scheduler.stop
|
28
36
|
end
|
29
37
|
|
30
|
-
def handle(page,
|
38
|
+
def handle(page, args)
|
31
39
|
crawler = @crawler_class.new(page)
|
32
|
-
crawler.send(
|
33
|
-
if
|
34
|
-
scheduler.post(*
|
40
|
+
crawler.send(*args) do |*result|
|
41
|
+
if result.flatten.all? { |i| i.is_a?(Request) }
|
42
|
+
scheduler.post(*result.flatten)
|
35
43
|
else
|
36
|
-
@middleware&.call(*
|
44
|
+
@middleware&.call(*result)
|
37
45
|
end
|
38
46
|
end
|
39
47
|
ensure
|
40
|
-
page.close
|
48
|
+
page.close if page
|
41
49
|
end
|
42
50
|
|
51
|
+
private
|
52
|
+
|
43
53
|
def start_requests
|
44
54
|
Request.build(*settings[:start_urls])
|
45
55
|
end
|
data/lib/vessel/request.rb
CHANGED
@@ -4,16 +4,24 @@ require "addressable/uri"
|
|
4
4
|
|
5
5
|
module Vessel
|
6
6
|
class Request
|
7
|
-
attr_reader :url, :uri, :method
|
7
|
+
attr_reader :url, :uri, :method, :data
|
8
8
|
|
9
9
|
def self.build(*urls)
|
10
|
-
urls.map { |url| new(url: url) }
|
10
|
+
urls.empty? ? [new] : urls.map { |url| new(url: url) }
|
11
11
|
end
|
12
12
|
|
13
|
-
def initialize(url
|
14
|
-
|
15
|
-
|
13
|
+
def initialize(url: nil, method: :parse, data: nil)
|
14
|
+
if url
|
15
|
+
@url = url.to_s
|
16
|
+
@uri = Addressable::URI.parse(@url)
|
17
|
+
end
|
18
|
+
|
16
19
|
@method = method
|
20
|
+
@data = data.freeze if data
|
21
|
+
end
|
22
|
+
|
23
|
+
def stub?
|
24
|
+
!url
|
17
25
|
end
|
18
26
|
end
|
19
27
|
end
|
data/lib/vessel/scheduler.rb
CHANGED
@@ -6,18 +6,23 @@ require "concurrent-ruby"
|
|
6
6
|
module Vessel
|
7
7
|
class Scheduler
|
8
8
|
extend Forwardable
|
9
|
-
delegate %i[scheduled_task_count completed_task_count queue_length] =>
|
9
|
+
delegate %i[scheduled_task_count completed_task_count queue_length] => :pool
|
10
10
|
|
11
|
-
attr_reader :browser, :queue, :delay
|
11
|
+
attr_reader :browser, :queue, :delay, :headers
|
12
12
|
|
13
13
|
def initialize(queue, settings)
|
14
14
|
@queue = queue
|
15
|
-
@min_threads, @max_threads, @delay =
|
16
|
-
settings.values_at(:min_threads, :max_threads, :delay)
|
15
|
+
@min_threads, @max_threads, @delay, @headers =
|
16
|
+
settings.values_at(:min_threads, :max_threads, :delay, :headers)
|
17
17
|
|
18
|
-
options =
|
18
|
+
options = settings[:ferrum]
|
19
19
|
options.merge!(timeout: settings[:timeout]) if settings[:timeout]
|
20
20
|
@browser = Ferrum::Browser.new(**options)
|
21
|
+
|
22
|
+
if settings[:intercept]
|
23
|
+
@browser.network.intercept
|
24
|
+
@browser.on(:request, &settings[:intercept])
|
25
|
+
end
|
21
26
|
end
|
22
27
|
|
23
28
|
def post(*requests)
|
@@ -28,6 +33,12 @@ module Vessel
|
|
28
33
|
end
|
29
34
|
end
|
30
35
|
|
36
|
+
def stop
|
37
|
+
pool.shutdown
|
38
|
+
pool.kill unless pool.wait_for_termination(30)
|
39
|
+
browser.quit
|
40
|
+
end
|
41
|
+
|
31
42
|
private
|
32
43
|
|
33
44
|
def pool
|
@@ -39,7 +50,10 @@ module Vessel
|
|
39
50
|
end
|
40
51
|
|
41
52
|
def goto(request)
|
53
|
+
return [nil, request] if request.stub?
|
54
|
+
|
42
55
|
page = browser.create_page
|
56
|
+
page.headers.set(headers) if headers
|
43
57
|
# Delay is set between requests when we don't want to bombard server with
|
44
58
|
# requests so it requires crawler to be single threaded. Otherwise doesn't
|
45
59
|
# make sense.
|
data/lib/vessel/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: vessel
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.1
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Dmitry Vorotilin
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2021-03-09 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: ferrum
|
@@ -16,28 +16,28 @@ dependencies:
|
|
16
16
|
requirements:
|
17
17
|
- - ">="
|
18
18
|
- !ruby/object:Gem::Version
|
19
|
-
version: '0.
|
19
|
+
version: '0.8'
|
20
20
|
type: :runtime
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
24
|
- - ">="
|
25
25
|
- !ruby/object:Gem::Version
|
26
|
-
version: '0.
|
26
|
+
version: '0.8'
|
27
27
|
- !ruby/object:Gem::Dependency
|
28
28
|
name: thor
|
29
29
|
requirement: !ruby/object:Gem::Requirement
|
30
30
|
requirements:
|
31
31
|
- - "~>"
|
32
32
|
- !ruby/object:Gem::Version
|
33
|
-
version: '0
|
33
|
+
version: '1.0'
|
34
34
|
type: :runtime
|
35
35
|
prerelease: false
|
36
36
|
version_requirements: !ruby/object:Gem::Requirement
|
37
37
|
requirements:
|
38
38
|
- - "~>"
|
39
39
|
- !ruby/object:Gem::Version
|
40
|
-
version: '0
|
40
|
+
version: '1.0'
|
41
41
|
- !ruby/object:Gem::Dependency
|
42
42
|
name: bundler
|
43
43
|
requirement: !ruby/object:Gem::Requirement
|
@@ -117,7 +117,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
117
117
|
- !ruby/object:Gem::Version
|
118
118
|
version: '0'
|
119
119
|
requirements: []
|
120
|
-
rubygems_version: 3.
|
120
|
+
rubygems_version: 3.1.4
|
121
121
|
signing_key:
|
122
122
|
specification_version: 4
|
123
123
|
summary: High-level web crawling framework
|