cangrejo 0.1.5 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/cangrejo/errors.rb +19 -0
- data/lib/cangrejo/modes/local.rb +62 -9
- data/lib/cangrejo/session.rb +19 -4
- data/lib/cangrejo/version.rb +1 -1
- metadata +116 -28
- data/lib/cangrejo/support/launcher.rb +0 -79
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 654c44f4ed8700ac8dbb214f8fd3006540ee31cb
|
4
|
+
data.tar.gz: cdf70051e44adbee2ca3291061ddd2ac71155571
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 847580c48eb860475720e652e7ee83dbb66f91aa7c88c2a15635296f0aa9471b7d63b67209a60b532a7ed421331aa8fc98b5c3721e68e6b913a2bc1edaf98b5d
|
7
|
+
data.tar.gz: e42eda249f094f9eca98b8a67f2f1bf77de681fe4c20e8538872ec46977a8cd5f255b9c96d2907cbc0dc72baddb71eb8d1664df1d30a92caa2d2238c66ff6984
|
data/lib/cangrejo/errors.rb
CHANGED
@@ -4,4 +4,23 @@ module Cangrejo
|
|
4
4
|
|
5
5
|
class ConfigurationError < Error; end
|
6
6
|
|
7
|
+
class LaunchTimeout < Error
|
8
|
+
def initialize(_msg)
|
9
|
+
super "Timed out trying to start crawler"
|
10
|
+
end
|
11
|
+
end
|
12
|
+
|
13
|
+
class CrawlerError < Error
|
14
|
+
|
15
|
+
def initialize(_msg, _backtrace)
|
16
|
+
super _msg
|
17
|
+
@original_bt = _backtrace
|
18
|
+
end
|
19
|
+
|
20
|
+
def set_backtrace(_backtrace)
|
21
|
+
super @original_bt + _backtrace
|
22
|
+
end
|
23
|
+
|
24
|
+
end
|
25
|
+
|
7
26
|
end
|
data/lib/cangrejo/modes/local.rb
CHANGED
@@ -1,41 +1,94 @@
|
|
1
1
|
require "cangrejo/restclient/request_extensions"
|
2
2
|
require "cangrejo/restclient/json_resource"
|
3
|
-
require "
|
3
|
+
require "childprocess"
|
4
4
|
|
5
5
|
module Cangrejo
|
6
6
|
module Modes
|
7
7
|
class Local
|
8
8
|
|
9
|
+
attr_reader :process, :path
|
10
|
+
|
9
11
|
def initialize(_path)
|
10
12
|
@path = _path
|
11
13
|
end
|
12
14
|
|
13
15
|
def setup
|
14
|
-
|
16
|
+
select_socket_file
|
17
|
+
start_process
|
18
|
+
wait_for_socket
|
15
19
|
init_rest_client
|
16
20
|
end
|
17
21
|
|
18
22
|
def release
|
19
|
-
|
23
|
+
process.stop unless process.nil?
|
20
24
|
end
|
21
25
|
|
22
26
|
private
|
23
27
|
|
24
|
-
def
|
25
|
-
|
26
|
-
|
28
|
+
def cmd_enviroment
|
29
|
+
{
|
30
|
+
'BUNDLE_GEMFILE' => gem_path
|
31
|
+
}
|
32
|
+
end
|
33
|
+
|
34
|
+
def cmd_arguments
|
35
|
+
[
|
36
|
+
'--no-reload'
|
37
|
+
]
|
27
38
|
end
|
28
39
|
|
29
40
|
def launch_timeout
|
30
41
|
5.0
|
31
42
|
end
|
32
43
|
|
33
|
-
def
|
34
|
-
|
44
|
+
def start_process
|
45
|
+
@process = prepare_process
|
46
|
+
@process.start
|
47
|
+
end
|
48
|
+
|
49
|
+
def prepare_process
|
50
|
+
cmd = [ "bin/crabfarm", "s", "--host=#{host}" ]
|
51
|
+
cmd += cmd_arguments
|
52
|
+
|
53
|
+
puts cmd.join(' ')
|
54
|
+
|
55
|
+
cp = ChildProcess.build(*cmd)
|
56
|
+
cp.environment.merge! cmd_enviroment
|
57
|
+
cp.cwd = @path
|
58
|
+
cp.leader = true
|
59
|
+
cp.io.inherit!
|
60
|
+
|
61
|
+
return cp
|
62
|
+
end
|
63
|
+
|
64
|
+
def wait_for_socket
|
65
|
+
Timeout::timeout(launch_timeout, LaunchTimeout) do
|
66
|
+
# TODO: detect if the process crashes before timeout
|
67
|
+
sleep 0.1 while not File.exist? @socket_file
|
68
|
+
end
|
35
69
|
end
|
36
70
|
|
37
71
|
def init_rest_client
|
38
|
-
RestClient::JsonResource.new Net::SocketUri.new(
|
72
|
+
RestClient::JsonResource.new Net::SocketUri.new(host, '/api/state')
|
73
|
+
end
|
74
|
+
|
75
|
+
def select_socket_file
|
76
|
+
@socket_file = random_filename while @socket_file.nil? or File.exist? @socket_file
|
77
|
+
end
|
78
|
+
|
79
|
+
def random_filename
|
80
|
+
File.join(Cangrejo.config.temp_path, "csocket-#{Random.rand(1000000)}.sock")
|
81
|
+
end
|
82
|
+
|
83
|
+
def gem_path
|
84
|
+
File.join(@path, 'Gemfile')
|
85
|
+
end
|
86
|
+
|
87
|
+
def host
|
88
|
+
# TODO: add posibility to use ports instead of unix sockets, it would also
|
89
|
+
# be nice to have a mechanism where the loaded process reports the port it
|
90
|
+
# binded to.
|
91
|
+
"unix://#{@socket_file}"
|
39
92
|
end
|
40
93
|
|
41
94
|
end
|
data/lib/cangrejo/session.rb
CHANGED
@@ -53,16 +53,22 @@ module Cangrejo
|
|
53
53
|
|
54
54
|
params = add_timestamp(_params)
|
55
55
|
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
56
|
+
begin
|
57
|
+
while_times_out do
|
58
|
+
@rest.put(name: _state, params: params, wait: WAIT_STEP)
|
59
|
+
@state_name = _state
|
60
|
+
@state_params = _params
|
61
|
+
end
|
62
|
+
rescue RestClient::InternalServerError => exc
|
63
|
+
raise unwrap_packed_exception exc
|
60
64
|
end
|
61
65
|
|
62
66
|
wrap_response_doc
|
63
67
|
self
|
64
68
|
end
|
65
69
|
|
70
|
+
alias :navigate :crawl
|
71
|
+
|
66
72
|
def release
|
67
73
|
@mode.release
|
68
74
|
@rest = nil
|
@@ -107,5 +113,14 @@ module Cangrejo
|
|
107
113
|
end
|
108
114
|
end
|
109
115
|
|
116
|
+
def unwrap_packed_exception(_exc)
|
117
|
+
begin
|
118
|
+
data = JSON.parse _exc.http_body
|
119
|
+
Cangrejo::CrawlerError.new data['exception'], data['backtrace']
|
120
|
+
rescue
|
121
|
+
_exc
|
122
|
+
end
|
123
|
+
end
|
124
|
+
|
110
125
|
end
|
111
126
|
end
|
data/lib/cangrejo/version.rb
CHANGED
metadata
CHANGED
@@ -1,92 +1,183 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: cangrejo
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ignacio Baixas
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-
|
11
|
+
date: 2015-09-28 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rest-client
|
15
15
|
requirement: !ruby/object:Gem::Requirement
|
16
16
|
requirements:
|
17
|
-
- - ~>
|
17
|
+
- - "~>"
|
18
18
|
- !ruby/object:Gem::Version
|
19
|
-
version: 1.7
|
19
|
+
version: '1.7'
|
20
20
|
type: :runtime
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
|
-
- - ~>
|
24
|
+
- - "~>"
|
25
25
|
- !ruby/object:Gem::Version
|
26
|
-
version: 1.7
|
26
|
+
version: '1.7'
|
27
27
|
- !ruby/object:Gem::Dependency
|
28
28
|
name: git
|
29
29
|
requirement: !ruby/object:Gem::Requirement
|
30
30
|
requirements:
|
31
|
-
- -
|
31
|
+
- - "~>"
|
32
32
|
- !ruby/object:Gem::Version
|
33
|
-
version: '
|
33
|
+
version: '1.2'
|
34
34
|
type: :runtime
|
35
35
|
prerelease: false
|
36
36
|
version_requirements: !ruby/object:Gem::Requirement
|
37
37
|
requirements:
|
38
|
-
- -
|
38
|
+
- - "~>"
|
39
39
|
- !ruby/object:Gem::Version
|
40
|
-
version: '
|
40
|
+
version: '1.2'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: childprocess
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - "~>"
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '0.5'
|
48
|
+
type: :runtime
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - "~>"
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0.5'
|
41
55
|
- !ruby/object:Gem::Dependency
|
42
56
|
name: bundler
|
43
57
|
requirement: !ruby/object:Gem::Requirement
|
44
58
|
requirements:
|
45
|
-
- - ~>
|
59
|
+
- - "~>"
|
46
60
|
- !ruby/object:Gem::Version
|
47
61
|
version: '1.6'
|
48
62
|
type: :development
|
49
63
|
prerelease: false
|
50
64
|
version_requirements: !ruby/object:Gem::Requirement
|
51
65
|
requirements:
|
52
|
-
- - ~>
|
66
|
+
- - "~>"
|
53
67
|
- !ruby/object:Gem::Version
|
54
68
|
version: '1.6'
|
55
69
|
- !ruby/object:Gem::Dependency
|
56
70
|
name: rake
|
57
71
|
requirement: !ruby/object:Gem::Requirement
|
58
72
|
requirements:
|
59
|
-
- -
|
73
|
+
- - "~>"
|
60
74
|
- !ruby/object:Gem::Version
|
61
|
-
version: '
|
75
|
+
version: '10.4'
|
62
76
|
type: :development
|
63
77
|
prerelease: false
|
64
78
|
version_requirements: !ruby/object:Gem::Requirement
|
65
79
|
requirements:
|
66
|
-
- -
|
80
|
+
- - "~>"
|
67
81
|
- !ruby/object:Gem::Version
|
68
|
-
version: '
|
82
|
+
version: '10.4'
|
69
83
|
- !ruby/object:Gem::Dependency
|
70
84
|
name: rspec
|
71
85
|
requirement: !ruby/object:Gem::Requirement
|
72
86
|
requirements:
|
73
|
-
- -
|
87
|
+
- - "~>"
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: '3.1'
|
90
|
+
type: :development
|
91
|
+
prerelease: false
|
92
|
+
version_requirements: !ruby/object:Gem::Requirement
|
93
|
+
requirements:
|
94
|
+
- - "~>"
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: '3.1'
|
97
|
+
- !ruby/object:Gem::Dependency
|
98
|
+
name: rspec-nc
|
99
|
+
requirement: !ruby/object:Gem::Requirement
|
100
|
+
requirements:
|
101
|
+
- - "~>"
|
102
|
+
- !ruby/object:Gem::Version
|
103
|
+
version: '0.2'
|
104
|
+
type: :development
|
105
|
+
prerelease: false
|
106
|
+
version_requirements: !ruby/object:Gem::Requirement
|
107
|
+
requirements:
|
108
|
+
- - "~>"
|
109
|
+
- !ruby/object:Gem::Version
|
110
|
+
version: '0.2'
|
111
|
+
- !ruby/object:Gem::Dependency
|
112
|
+
name: guard
|
113
|
+
requirement: !ruby/object:Gem::Requirement
|
114
|
+
requirements:
|
115
|
+
- - "~>"
|
116
|
+
- !ruby/object:Gem::Version
|
117
|
+
version: '2.11'
|
118
|
+
type: :development
|
119
|
+
prerelease: false
|
120
|
+
version_requirements: !ruby/object:Gem::Requirement
|
121
|
+
requirements:
|
122
|
+
- - "~>"
|
123
|
+
- !ruby/object:Gem::Version
|
124
|
+
version: '2.11'
|
125
|
+
- !ruby/object:Gem::Dependency
|
126
|
+
name: guard-rspec
|
127
|
+
requirement: !ruby/object:Gem::Requirement
|
128
|
+
requirements:
|
129
|
+
- - "~>"
|
130
|
+
- !ruby/object:Gem::Version
|
131
|
+
version: '4.5'
|
132
|
+
type: :development
|
133
|
+
prerelease: false
|
134
|
+
version_requirements: !ruby/object:Gem::Requirement
|
135
|
+
requirements:
|
136
|
+
- - "~>"
|
137
|
+
- !ruby/object:Gem::Version
|
138
|
+
version: '4.5'
|
139
|
+
- !ruby/object:Gem::Dependency
|
140
|
+
name: terminal-notifier-guard
|
141
|
+
requirement: !ruby/object:Gem::Requirement
|
142
|
+
requirements:
|
143
|
+
- - "~>"
|
74
144
|
- !ruby/object:Gem::Version
|
75
|
-
version: '
|
145
|
+
version: '1.6'
|
146
|
+
- - ">="
|
147
|
+
- !ruby/object:Gem::Version
|
148
|
+
version: 1.6.1
|
76
149
|
type: :development
|
77
150
|
prerelease: false
|
78
151
|
version_requirements: !ruby/object:Gem::Requirement
|
79
152
|
requirements:
|
80
|
-
- -
|
153
|
+
- - "~>"
|
154
|
+
- !ruby/object:Gem::Version
|
155
|
+
version: '1.6'
|
156
|
+
- - ">="
|
157
|
+
- !ruby/object:Gem::Version
|
158
|
+
version: 1.6.1
|
159
|
+
- !ruby/object:Gem::Dependency
|
160
|
+
name: sys-proctable
|
161
|
+
requirement: !ruby/object:Gem::Requirement
|
162
|
+
requirements:
|
163
|
+
- - "~>"
|
81
164
|
- !ruby/object:Gem::Version
|
82
|
-
version: '0'
|
83
|
-
|
165
|
+
version: '0.9'
|
166
|
+
type: :development
|
167
|
+
prerelease: false
|
168
|
+
version_requirements: !ruby/object:Gem::Requirement
|
169
|
+
requirements:
|
170
|
+
- - "~>"
|
171
|
+
- !ruby/object:Gem::Version
|
172
|
+
version: '0.9'
|
173
|
+
description: Cangrejo lets you consume crabfarm crawlers using a simple DSL
|
84
174
|
email:
|
85
175
|
- ignacio@platan.us
|
86
176
|
executables: []
|
87
177
|
extensions: []
|
88
178
|
extra_rdoc_files: []
|
89
179
|
files:
|
180
|
+
- lib/cangrejo.rb
|
90
181
|
- lib/cangrejo/configurator.rb
|
91
182
|
- lib/cangrejo/errors.rb
|
92
183
|
- lib/cangrejo/modes/git.rb
|
@@ -97,10 +188,8 @@ files:
|
|
97
188
|
- lib/cangrejo/restclient/json_resource.rb
|
98
189
|
- lib/cangrejo/restclient/request_extensions.rb
|
99
190
|
- lib/cangrejo/session.rb
|
100
|
-
- lib/cangrejo/support/launcher.rb
|
101
191
|
- lib/cangrejo/version.rb
|
102
|
-
|
103
|
-
homepage: ''
|
192
|
+
homepage: https://github.com/platanus/cangrejo-gem
|
104
193
|
licenses:
|
105
194
|
- MIT
|
106
195
|
metadata: {}
|
@@ -110,19 +199,18 @@ require_paths:
|
|
110
199
|
- lib
|
111
200
|
required_ruby_version: !ruby/object:Gem::Requirement
|
112
201
|
requirements:
|
113
|
-
- -
|
202
|
+
- - ">="
|
114
203
|
- !ruby/object:Gem::Version
|
115
204
|
version: '0'
|
116
205
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
117
206
|
requirements:
|
118
|
-
- -
|
207
|
+
- - ">="
|
119
208
|
- !ruby/object:Gem::Version
|
120
209
|
version: '0'
|
121
210
|
requirements: []
|
122
211
|
rubyforge_project:
|
123
|
-
rubygems_version: 2.
|
212
|
+
rubygems_version: 2.4.5
|
124
213
|
signing_key:
|
125
214
|
specification_version: 4
|
126
215
|
summary: Crabfarm client for ruby
|
127
216
|
test_files: []
|
128
|
-
has_rdoc:
|
@@ -1,79 +0,0 @@
|
|
1
|
-
require 'timeout'
|
2
|
-
|
3
|
-
module Cangrejo
|
4
|
-
module Support
|
5
|
-
class Launcher
|
6
|
-
|
7
|
-
class LaunchTimeout < Cangrejo::Error
|
8
|
-
def initialize(_msg)
|
9
|
-
super "Timed out trying to start crawler"
|
10
|
-
end
|
11
|
-
end
|
12
|
-
|
13
|
-
SPAWN_TIMEOUT = 5
|
14
|
-
KILL_TIMEOUT = 5
|
15
|
-
|
16
|
-
def initialize(_path, _options={})
|
17
|
-
@path = _path
|
18
|
-
@timeout = _options.fetch(:timeout, SPAWN_TIMEOUT)
|
19
|
-
@argv = _options.fetch(:argv, [])
|
20
|
-
select_socket_file
|
21
|
-
end
|
22
|
-
|
23
|
-
def host
|
24
|
-
"unix://#{@socket_file}"
|
25
|
-
end
|
26
|
-
|
27
|
-
def launch
|
28
|
-
gem_path = File.join(@path, 'Gemfile')
|
29
|
-
# TODO: for some reason, the gemfile path must be specified here, maybe because of rbenv?
|
30
|
-
@pid = Process.spawn({ 'BUNDLE_GEMFILE' => gem_path }, "bin/crabfarm s --host=#{host} #{@argv.join(' ')}", chdir: @path, pgroup: true)
|
31
|
-
wait_for_socket
|
32
|
-
end
|
33
|
-
|
34
|
-
def kill
|
35
|
-
safe_kill @pid unless @pid.nil?
|
36
|
-
end
|
37
|
-
|
38
|
-
private
|
39
|
-
|
40
|
-
def select_socket_file
|
41
|
-
@socket_file = random_filename while @socket_file.nil? or File.exist? @socket_file
|
42
|
-
end
|
43
|
-
|
44
|
-
def random_filename
|
45
|
-
File.join(Cangrejo.config.temp_path, "csocket-#{Random.rand(1000000)}.sock")
|
46
|
-
end
|
47
|
-
|
48
|
-
def wait_for_socket
|
49
|
-
Timeout::timeout(@timeout, LaunchTimeout) do
|
50
|
-
# TODO: detect if the process crashes before timeout
|
51
|
-
sleep 0.1 while not File.exist? @socket_file
|
52
|
-
end
|
53
|
-
end
|
54
|
-
|
55
|
-
def safe_kill _pid
|
56
|
-
begin
|
57
|
-
Timeout.timeout(KILL_TIMEOUT) do
|
58
|
-
Process.kill "INT", _pid
|
59
|
-
Process.wait _pid
|
60
|
-
end
|
61
|
-
rescue Timeout::Error
|
62
|
-
ensure
|
63
|
-
ensure_dead _pid
|
64
|
-
end
|
65
|
-
end
|
66
|
-
|
67
|
-
def ensure_dead _pid
|
68
|
-
begin
|
69
|
-
# Kill the entire process group to make sure childs aren't left hanging around
|
70
|
-
Process.kill(-9, _pid)
|
71
|
-
Process.wait _pid
|
72
|
-
rescue
|
73
|
-
nil
|
74
|
-
end
|
75
|
-
end
|
76
|
-
|
77
|
-
end
|
78
|
-
end
|
79
|
-
end
|