skylight 0.3.21 → 0.4.0.alpha1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +0 -4
- data/ext/extconf.rb +92 -47
- data/ext/libskylight.yml +4 -4
- data/ext/skylight_native.c +248 -286
- data/lib/skylight.rb +19 -114
- data/lib/skylight/api.rb +1 -1
- data/lib/skylight/config.rb +176 -146
- data/lib/skylight/data/cacert.pem +717 -719
- data/lib/skylight/formatters/http.rb +1 -1
- data/lib/skylight/instrumenter.rb +28 -35
- data/lib/skylight/native.rb +58 -72
- data/lib/skylight/normalizers.rb +0 -1
- data/lib/skylight/normalizers/active_record/sql.rb +0 -4
- data/lib/skylight/probes/excon/middleware.rb +3 -1
- data/lib/skylight/probes/net_http.rb +3 -1
- data/lib/skylight/subscriber.rb +0 -4
- data/lib/skylight/trace.rb +189 -0
- data/lib/skylight/util.rb +10 -12
- data/lib/skylight/util/hostname.rb +17 -0
- data/lib/skylight/util/http.rb +33 -36
- data/lib/skylight/util/logging.rb +20 -1
- data/lib/skylight/util/multi_io.rb +21 -0
- data/lib/skylight/util/native_ext_fetcher.rb +83 -69
- data/lib/skylight/util/platform.rb +67 -0
- data/lib/skylight/util/ssl.rb +50 -0
- data/lib/skylight/version.rb +1 -1
- metadata +9 -34
- data/ext/rust_support/ruby.h +0 -93
- data/ext/skylight.h +0 -85
- data/ext/skylight.map +0 -4
- data/ext/test/extconf.rb +0 -18
- data/ext/test/skylight_native_test.c +0 -82
- data/ext/test/skylight_test.h +0 -20
- data/lib/skylight/formatters.rb +0 -6
- data/lib/skylight/messages.rb +0 -21
- data/lib/skylight/messages/error.rb +0 -15
- data/lib/skylight/messages/hello.rb +0 -13
- data/lib/skylight/messages/trace.rb +0 -179
- data/lib/skylight/messages/trace_envelope.rb +0 -19
- data/lib/skylight/metrics.rb +0 -9
- data/lib/skylight/metrics/ewma.rb +0 -69
- data/lib/skylight/metrics/meter.rb +0 -58
- data/lib/skylight/metrics/process_cpu_gauge.rb +0 -65
- data/lib/skylight/metrics/process_mem_gauge.rb +0 -34
- data/lib/skylight/util/conversions.rb +0 -9
- data/lib/skylight/util/queue.rb +0 -96
- data/lib/skylight/util/task.rb +0 -172
- data/lib/skylight/util/uniform_sample.rb +0 -63
- data/lib/skylight/worker.rb +0 -19
- data/lib/skylight/worker/builder.rb +0 -73
- data/lib/skylight/worker/collector.rb +0 -274
- data/lib/skylight/worker/connection.rb +0 -87
- data/lib/skylight/worker/connection_set.rb +0 -56
- data/lib/skylight/worker/embedded.rb +0 -24
- data/lib/skylight/worker/metrics_reporter.rb +0 -104
- data/lib/skylight/worker/server.rb +0 -336
- data/lib/skylight/worker/standalone.rb +0 -421
@@ -1,336 +0,0 @@
|
|
1
|
-
require 'socket'
|
2
|
-
|
3
|
-
module Skylight
|
4
|
-
module Worker
|
5
|
-
class Server
|
6
|
-
LOCKFILE_PATH = 'SKYLIGHT_LOCKFILE_PATH'.freeze
|
7
|
-
LOCKFILE_ENV_KEY = 'SKYLIGHT_LOCKFILE_FD'.freeze
|
8
|
-
UDS_SRV_FD_KEY = 'SKYLIGHT_UDS_FD'.freeze
|
9
|
-
KEEPALIVE_KEY = 'SKYLIGHT_KEEPALIVE'.freeze
|
10
|
-
|
11
|
-
include Util::Logging
|
12
|
-
|
13
|
-
attr_reader \
|
14
|
-
:pid,
|
15
|
-
:tick,
|
16
|
-
:config,
|
17
|
-
:keepalive,
|
18
|
-
:lockfile_path,
|
19
|
-
:sockfile_path,
|
20
|
-
:last_status_update,
|
21
|
-
:max_memory
|
22
|
-
|
23
|
-
# Builds a server around an already-open lockfile and unix domain server
# socket.
#
# config        - configuration object; read with `[]` for the
#                 `agent.keepalive`, `agent.sockfile_path` and
#                 `agent.max_memory` keys and passed to Collector.build
# lockfile      - open lockfile handle (required)
# srv           - unix domain server socket (required)
# lockfile_path - path of the lockfile
#
# Raises ArgumentError when either lockfile or srv is missing.
def initialize(config, lockfile, srv, lockfile_path)
  if !lockfile || !srv
    raise ArgumentError, "lockfile and unix domain server socket are required"
  end

  # Plain state that needs no collaborators
  @pid           = Process.pid
  @run           = true
  @tick          = 1
  @socks         = []
  @config        = config
  @server        = srv
  @lockfile      = lockfile
  @lockfile_path = lockfile_path

  # Collaborators and config lookups, in their original order
  @collector         = Collector.build(config)
  @metrics_reporter  = @collector.metrics_reporter
  @keepalive         = @config[:'agent.keepalive']
  @connections       = ConnectionSet.new
  @sockfile_path     = @config[:'agent.sockfile_path']
  @process_mem_gauge = Metrics::ProcessMemGauge.new
  @process_cpu_gauge = Metrics::ProcessCpuGauge.new
  @max_memory        = @config[:'agent.max_memory']
  @booted_at         = Util::Clock.absolute_secs
end
|
46
|
-
|
47
|
-
# Called from skylight.rb on require
|
48
|
-
# Entry point of the agent process: rebuilds the server from environment
# variables and runs it.
#
# Reads the lockfile FD (LOCKFILE_ENV_KEY), the lockfile path
# (LOCKFILE_PATH) and, when present, the unix domain server socket FD
# (UDS_SRV_FD_KEY) from ENV. Any missing or invalid value prints a message
# to STDERR and exits with status 1.
#
# Returns the result of Server#run.
def self.boot
  fail = lambda do |msg|
    STDERR.puts msg
    exit 1
  end

  config = Config.load_from_env

  unless fd = ENV[LOCKFILE_ENV_KEY]
    fail.call "missing lockfile FD"
  end

  # \A / \z anchor the whole value. The previous ^ / $ anchors only matched
  # a single line, so a value such as "7\njunk" passed validation.
  unless fd =~ /\A\d+\z/
    fail.call "invalid lockfile FD"
  end

  begin
    lockfile = IO.open(fd.to_i)
  rescue StandardError => e
    # Only ordinary errors (bad descriptor, bad argument) mean "invalid FD";
    # signals and exit requests must not be reported as one.
    fail.call "invalid lockfile FD: #{e.message}"
  end

  unless lockfile_path = ENV[LOCKFILE_PATH]
    fail.call "missing lockfile path"
  end

  unless config[:'agent.sockfile_path']
    fail.call "missing sockfile path"
  end

  srv = nil
  if fd = ENV[UDS_SRV_FD_KEY]
    srv = UNIXServer.for_fd(fd.to_i)
  end

  server = new(
    config,
    lockfile,
    srv,
    lockfile_path)

  server.run
end
|
91
|
-
|
92
|
-
# Replaces the current process with the agent command.
#
# cmd           - command line as an array of strings
# config        - configuration object; its `to_env` result seeds the
#                 environment of the new process
# lockfile      - open lockfile; its descriptor is passed via LOCKFILE_ENV_KEY
# srv           - optional unix domain server socket; passed via
#                 UDS_SRV_FD_KEY when given
# lockfile_path - passed via LOCKFILE_PATH
#
# Does not return when Kernel.exec succeeds.
def self.exec(cmd, config, lockfile, srv, lockfile_path)
  env = config.to_env
  env.merge!(
    STANDALONE_ENV_KEY => STANDALONE_ENV_VAL,
    LOCKFILE_PATH      => lockfile_path,
    LOCKFILE_ENV_KEY   => lockfile.fileno.to_s)
  env[UDS_SRV_FD_KEY] = srv.fileno.to_s if srv

  # On 1.9+ map each descriptor onto itself in the exec options so both
  # handles are handed to the new process.
  inherited = {}
  if RUBY_VERSION >= '1.9'
    [lockfile, srv].each do |io|
      next unless io
      descriptor = io.fileno.to_i
      inherited[descriptor] = descriptor
    end
  end

  Kernel.exec(*([env] + cmd + [inherited]))
end
|
116
|
-
|
117
|
-
# Runs the server: one-time setup followed by the IO loop. `cleanup` always
# runs afterwards, even if setup or the loop raises.
#
# Returns whatever `work` returns.
def run
  begin
    init
    work
  ensure
    cleanup
  end
end
|
123
|
-
|
124
|
-
private
|
125
|
-
|
126
|
-
# One-time setup before the IO loop: installs signal handlers, registers
# the worker metrics and spawns the collector.
#
# Returns the result of `@collector.spawn`.
def init
  # TODO: Not super ideal to always iterate here even if debug mode isn't
  # enabled, but it's not super perf critical. We will fix when we revamp
  # logging
  debug "initializing server; config=%s", config.to_env

  # Both signals only clear the run flag; the loop in `work` checks it
  %w[TERM INT].each do |signal|
    trap(signal) { @run = false }
  end

  # Register metrics
  uptime = lambda { Util::Clock.absolute_secs - @booted_at }
  @metrics_reporter.register("worker.memory", @process_mem_gauge)
  @metrics_reporter.register("worker.cpu", @process_cpu_gauge)
  @metrics_reporter.register("worker.uptime", uptime)
  @metrics_reporter.register("worker.ipc.open-connections", @connections.open_connections)
  @metrics_reporter.register("worker.ipc.throughput", @connections.throughput)

  info "starting skylight daemon"
  @collector.spawn
end
|
145
|
-
|
146
|
-
# The server's IO loop. Each iteration waits up to `tick` seconds for a
# readable socket, accepts new clients or drains messages from existing
# ones, then runs the keepalive, sanity and memory checks.
#
# Returns true when the loop ends because `@run` was cleared, false when an
# unexpected exception aborted it. A final metrics report is always posted.
def work
  t { "server working" }
  @socks << @server

  now = Time.now.to_i
  next_sanity_check_at = now + tick
  had_client_at = now

  trace "starting IO loop"
  begin
    # Wait for something to do
    r, _, _ = IO.select(@socks, [], [], tick)

    if r
      r.each do |sock|
        if sock == @server
          # If the server socket, accept
          # the incoming connection
          if client = accept
            connect(client)
          end
        else
          # Client socket, lookup the associated connection
          # state machine.
          unless conn = @connections[sock]
            # No associated connection, weird.. bail
            client_close(sock)
            next
          end

          begin
            # Pop em while we got em
            while msg = conn.read
              handle(msg)
            end
          rescue SystemCallError, EOFError
            client_close(sock)
          rescue IpcProtoError => e
            error "Server#work - IPC protocol exception: %s", e.message
            client_close(sock)
          end
        end
      end
    end

    now = Time.now.to_i

    # @socks always holds the server socket, so more than one entry means
    # at least one client is connected
    if @socks.length > 1
      had_client_at = now
    end

    if keepalive < now - had_client_at
      info "no clients for #{keepalive} sec - shutting down"
      @run = false
    else
      if next_sanity_check_at <= now
        next_sanity_check_at = now + tick
        sanity_check
      end

      memory_usage = @process_mem_gauge.call
      if memory_usage > max_memory
        raise WorkerStateError, "Memory limit exceeded: #{memory_usage} (max: #{max_memory})"
      end
    end

  rescue SignalException => e
    error "Did not handle: #{e.class}"
    @run = false
  rescue WorkerStateError => e
    info "#{e.message} - shutting down"
    @run = false
  rescue Exception => e
    # Last line of defence for the loop: report the failure and abort.
    # Everything that can be raised descends from Exception, so no further
    # rescue clause is needed after this one.
    error "Loop exception: %s (%s)\n%s", e.message, e.class, e.backtrace.join("\n")
    @collector.send_exception(e)
    return false
  end while @run

  true # Successful return
ensure
  # Send a final metrics report
  @metrics_reporter.post_report
end
|
234
|
-
|
235
|
-
# Handles an incoming message. Will be instances from
|
236
|
-
# the Messages namespace
|
237
|
-
# Dispatches one message read from a client connection.
#
# msg - nil (ignored), a Hello (triggers a re-exec when it reports a newer
#       agent version), a trace envelope or error (submitted to the
#       collector), or anything else (only logged at debug level).
def handle(msg)
  case msg
  when nil
    return
  when Hello
    if msg.newer?
      info "newer version of agent deployed - restarting; curr=%s; new=%s", VERSION, msg.version
      reload(msg)
    end
  when Messages::TraceEnvelope, Error
    t { "received message" }
    @collector.submit(msg)
  when :unknown
    debug "received unknown message"
  else
    # Spelling fixed ("recieved") so the message matches the others when
    # searching logs for "received"
    debug "received: %s", msg
  end
end
|
255
|
-
|
256
|
-
# Restarts the agent with the command carried by a Hello message: closes
# every client connection, then re-execs through Server.exec with the
# current config, lockfile and server socket.
def reload(hello)
  # Close all client connections
  trace "closing all client connections"
  clients_close

  # Re-exec the process
  trace "re-exec"
  exec_args = [hello.cmd, @config, @lockfile, @server, lockfile_path]
  Server.exec(*exec_args)
end
|
265
|
-
|
266
|
-
# Accepts a pending client connection without blocking.
#
# Returns the accepted socket, or nil when no connection is ready or the
# pending one was aborted.
def accept
  begin
    @server.accept_nonblock
  rescue Errno::EWOULDBLOCK, Errno::EAGAIN, Errno::ECONNABORTED
    nil
  end
end
|
270
|
-
|
271
|
-
# Starts tracking a freshly accepted client socket: adds it to the select
# set and to the connection set.
#
# Returns the result of `@connections.add`.
def connect(sock)
  trace "client accepted"
  @socks.push(sock)
  @connections.add(sock)
end
|
276
|
-
|
277
|
-
# Releases everything the server holds: removes this process's sockfile,
# closes the server and client sockets, and closes the lockfile handle.
#
# The lockfile is closed in an `ensure` so that an error while removing the
# sockfile or closing sockets cannot leak the handle. Such an error still
# propagates to the caller.
def cleanup
  t { "server cleaning up" }
  # The lockfile is not deleted. There is no way to atomically ensure
  # that we are deleting the lockfile for the current process.
  cleanup_curr_sockfile
  close
ensure
  @lockfile.close
end
|
285
|
-
|
286
|
-
# Closes the server socket (when there is one) and then every client
# connection.
#
# Returns the result of `clients_close`.
def close
  @server && @server.close
  clients_close
end
|
290
|
-
|
291
|
-
# Closes every socket currently held by the connection set.
def clients_close
  @connections.socks.each(&method(:client_close))
end
|
296
|
-
|
297
|
-
# Stops tracking one client socket: hands it to the connection set for
# cleanup and removes it from the select set.
#
# Returns the result of `@socks.delete`.
def client_close(sock)
  descriptor = sock.fileno
  trace "closing client connection; fd=%d", descriptor
  @connections.cleanup(sock)
  @socks.delete(sock)
end
|
302
|
-
|
303
|
-
# Path of this process's unix domain socket file, built from
# `sockfile_path` and `pid`.
def sockfile
  sockfile_path.to_s + "/skylight-" + pid.to_s + ".sock"
end
|
306
|
-
|
307
|
-
# True when this process's socket file is present on disk.
def sockfile?
  path = sockfile
  File.exist?(path)
end
|
310
|
-
|
311
|
-
# Best-effort removal of this process's socket file. Errors (for example a
# file that is already gone) are ignored.
#
# Returns the result of File.unlink, or nil when removal failed.
def cleanup_curr_sockfile
  begin
    File.unlink(sockfile)
  rescue StandardError
    nil
  end
end
|
314
|
-
|
315
|
-
# Verifies that this process still owns the lockfile and that its socket
# file still exists.
#
# Raises WorkerStateError when the lockfile is missing, unreadable, names a
# different pid, or when the socket file is gone.
def sanity_check
  raise WorkerStateError, "lockfile gone" unless File.exist?(lockfile_path)

  # The lockfile can disappear between the check above and this read
  owner = begin
    File.read(lockfile_path)
  rescue StandardError
    nil
  end

  raise WorkerStateError, "could not read lockfile" unless owner
  raise WorkerStateError, "lockfile points to different process" if owner != Process.pid.to_s
  raise WorkerStateError, "sockfile gone" unless sockfile?
end
|
334
|
-
end
|
335
|
-
end
|
336
|
-
end
|
@@ -1,421 +0,0 @@
|
|
1
|
-
require 'socket'
|
2
|
-
require 'thread'
|
3
|
-
require 'fileutils'
|
4
|
-
require 'rbconfig'
|
5
|
-
|
6
|
-
# TODO: Handle cool-off
|
7
|
-
module Skylight
|
8
|
-
module Worker
|
9
|
-
# Handle to the agent subprocess. Manages creation, communication, and
|
10
|
-
# shutdown. Lazily spawns a thread that handles writing messages to the
|
11
|
-
# unix domain socket
|
12
|
-
#
|
13
|
-
class Standalone
|
14
|
-
include Util::Logging
|
15
|
-
|
16
|
-
# Locates skylight_native so that it can be included in the standalone agent startup command
|
17
|
-
# Finds the loaded skylight_native extension among $LOADED_FEATURES.
#
# Returns the first feature path matching `skylight_native.<DLEXT>`, or nil
# when the extension has not been loaded.
def self.locate_skylight_native
  pattern = /skylight_native\.#{RbConfig::CONFIG['DLEXT']}/
  $LOADED_FEATURES.find { |feature| feature =~ pattern }
end
|
22
|
-
|
23
|
-
# Builds the command line used to start the standalone agent.
#
# Returns an array: RUBYBIN, one `-I <path>` pair for the Ruby code root
# and (when the native extension was located) for its directory, and
# finally the path of skylight.rb, the agent startup script.
def self.build_subprocess_cmd
  # Native extension location
  native_path = locate_skylight_native

  load_paths = [File.expand_path('../../..', __FILE__)] # Ruby code root
  load_paths << File.dirname(native_path) if native_path

  cmd = [ RUBYBIN ]
  load_paths.uniq.each { |dir| cmd.push("-I", dir) }
  cmd << File.expand_path('../../../skylight.rb', __FILE__) # The agent startup script
end
|
38
|
-
|
39
|
-
# Used to start the standalone agent as well as included in the hello message
|
40
|
-
SUBPROCESS_CMD = build_subprocess_cmd
|
41
|
-
|
42
|
-
# Used to handle starting the thread
|
43
|
-
LOCK = Mutex.new
|
44
|
-
|
45
|
-
attr_reader \
|
46
|
-
:pid,
|
47
|
-
:config,
|
48
|
-
:lockfile,
|
49
|
-
:keepalive,
|
50
|
-
:max_spawns,
|
51
|
-
:spawn_window,
|
52
|
-
:sockfile_path
|
53
|
-
|
54
|
-
# Builds a handle to the agent subprocess. Nothing is spawned here.
#
# config   - configuration object; read with `[]` for the
#            `agent.keepalive` and `agent.sockfile_path` keys
# lockfile - lockfile path
# server   - object whose `exec` is later used to start the agent
#
# Raises ArgumentError when any argument is missing.
def initialize(config, lockfile, server)
  @pid  = nil
  @sock = nil

  if [config, lockfile, server].any? { |arg| !arg }
    raise ArgumentError, "all arguments are required"
  end

  @me            = Process.pid
  @config        = config
  @spawns        = []
  @server        = server
  @lockfile      = lockfile
  @keepalive     = config[:'agent.keepalive']
  @sockfile_path = config[:'agent.sockfile_path']

  # Should be configurable
  @max_spawns   = 3
  @spawn_window = 5 * 60

  # Writer background processor will accept messages and write them to
  # the IPC socket
  @writer = build_queue
end
|
78
|
-
|
79
|
-
# Spawns the agent process and the writer task unless an agent pid is
# already known.
#
# args - forwarded to `__spawn`
#
# Returns true when the agent was spawned, nil otherwise.
def spawn(*args)
  return if @pid
  return unless __spawn(*args)

  @writer.spawn
  true
end
|
87
|
-
|
88
|
-
# Queues a message for delivery to the agent.
#
# msg - must respond to `encode` or `native_serialize`
#
# Returns nil without queueing when no agent pid is known. If the current
# process id differs from the recorded one, `handle_fork` runs first.
#
# Raises ArgumentError when the message is not encodable.
def submit(msg)
  encodable = msg.respond_to?(:encode) || msg.respond_to?(:native_serialize)
  raise ArgumentError, "message not encodable" unless encodable

  if !@pid
    t { "no pid, can't submit: #{msg.inspect}" }
    return
  end

  handle_fork if @me != Process.pid

  @writer.submit(msg, @me)
end
|
104
|
-
|
105
|
-
# Shutdown any side task threads. Let the agent process die on its own.
#
# Queues the :SHUTDOWN marker on the writer, then shuts the writer down.
# Returns the result of the writer's `shutdown`.
def shutdown
  # TODO: implement
  @writer.submit :SHUTDOWN
  @writer.shutdown
end
|
111
|
-
|
112
|
-
# Shutdown any side task threads as well as the agent process.
#
# Currently this only delegates to `shutdown`.
def shutdown_all
  # TODO: implement
  shutdown()
end
|
117
|
-
|
118
|
-
private
|
119
|
-
|
120
|
-
# Spawns the agent process and connects to its unix domain socket.
#
# timeout - seconds to wait for the agent's lockfile and socket to appear
#           (at least 2)
#
# Spawns are rate limited: once `@max_spawns` spawns are on record, another
# is refused while the oldest one is still inside `@spawn_window` seconds.
#
# Returns true once connected (the hello message is written and @sock/@pid
# are set), false when rate limited or when the agent did not come up in
# time.
#
# Raises ArgumentError when timeout is below 2.
def __spawn(timeout = 10)
  if timeout < 2
    raise ArgumentError, "at least 2 seconds required"
  end

  start = Time.now

  if @spawns.length >= @max_spawns
    if @spawn_window >= (start - @spawns.first)
      trace "too many spawns in window"
      return false
    end

    # Evict the oldest spawn. This used to be `unshift` with no argument,
    # which is a no-op: the list kept growing and `@spawns.first` stayed
    # the very first spawn forever.
    @spawns.shift
  end

  @spawns << start

  check_permissions

  lockf = File.open lockfile, File::RDWR | File::CREAT

  spawn_worker(lockf)

  # Poll until the agent has written its pid and opened its socket
  while timeout >= (Time.now - start)
    if pid = read_lockfile
      if sockfile?(pid)
        if sock = connect(pid)
          trace "connected to unix socket; pid=%s", pid
          write_msg(sock, build_hello)
          @sock = sock
          @pid = pid
          return true
        end
      end
    end

    sleep 0.1
  end

  trace "failed to spawn worker"
  return false

ensure
  lockf.close rescue nil if lockf
end
|
166
|
-
|
167
|
-
# Re-establishes the connection to the agent.
#
# First tries to reconnect to the currently known agent pid; if that fails,
# tries to respawn the agent.
#
# Returns true when a connection is available again. Returns false after
# clearing @pid and @sock when the respawn failed.
def repair
  if @sock
    begin
      @sock.close
    rescue StandardError
      nil
    end
  end

  t { "repairing socket" }

  # Attempt to reconnect to the currently known agent PID. If the agent
  # is still healthy but is simply reloading itself, this should work
  # just fine.
  if sock = connect(@pid)
    t { "reconnected to worker" }
    @sock = sock
    # TODO: Should HELLO be sent again?
    return true
  end

  debug "failed to reconnect -- attempting worker respawn"

  # Attempt to respawn the agent process
  return true if __spawn

  debug "could not respawn -- shutting down"
  @pid = nil
  @sock = nil
  false
end
|
195
|
-
|
196
|
-
# Handler run by the writer task for each queued item.
#
# msg - :SHUTDOWN closes the socket and forgets the agent pid; any other
#       truthy value is written to the agent via `handle`; nil probes the
#       socket and repairs the connection when it is broken.
#
# Returns false on shutdown or when the connection could not be repaired,
# true after a probe, and the result of `handle` for a message.
def writer_tick(msg)
  if :SHUTDOWN == msg
    # Spelling fixed ("shuting") so the message can be found in logs
    trace "shutting down agent connection"
    @sock.close if @sock
    @pid = nil

    return false
  elsif msg
    return handle(msg)
  else
    begin
      @sock.read_nonblock(1)
    rescue Errno::EWOULDBLOCK, Errno::EAGAIN, Errno::EINTR
      # Nothing to read: the socket is still healthy
    rescue StandardError => e
      # EOFError, IOError, system call errors and a nil @sock all land
      # here. Interrupt and SystemExit are not StandardError, so they
      # propagate instead of being treated as a broken socket.
      trace "bad socket: #{e}"
      unless repair
        raise WorkerStateError, "could not repair connection to agent"
      end
    end

    return true
  end
rescue WorkerStateError => e
  error "skylight shutting down: %s", e.message
  return false
end
|
222
|
-
|
223
|
-
# Writes one message to the agent, making at most two attempts and
# repairing the connection around a failed write.
#
# Returns true once the message was written; false when a repair failed or
# both attempts failed.
def handle(msg)
  attempt = 0

  while attempt < 2
    attempt += 1

    conn = @sock
    unless conn
      return false unless repair
      conn = @sock
    end

    return true if write_msg(conn, msg)

    # The write failed: drop this socket and try to get a fresh one
    @sock = nil
    begin
      conn.close
    rescue StandardError
      nil
    end

    return false unless repair
  end

  debug "could not handle message; msg=%s", msg.class

  false
end
|
246
|
-
|
247
|
-
# Frames and writes one message: a header packed as two "L" values (the
# message class id and the payload size in bytes), followed by the
# serialized payload.
#
# Returns the result of the payload write, or the falsy result of the
# header write when that failed.
#
# Raises KeyError when the message class has no id in
# Messages::KLASS_TO_ID.
def write_msg(sock, msg)
  t { "writing a #{msg.class} on the wire" }

  type_id = Messages::KLASS_TO_ID.fetch(msg.class)
  payload = msg.serialize
  header  = [ type_id, payload.bytesize ].pack("LL")

  header_sent = write(sock, header)
  return header_sent unless header_sent

  write(sock, payload)
end
|
256
|
-
|
257
|
-
SOCK_TIMEOUT_VAL = [ 0, 0.01 * 1_000_000 ].pack("l_2")
|
258
|
-
|
259
|
-
# TODO: Handle configuring the socket with proper timeouts
|
260
|
-
# Opens a unix domain socket to the agent with the given pid and sets
# SO_SNDTIMEO on it from SOCK_TIMEOUT_VAL.
#
# Returns the connected socket, or nil when the connection could not be
# opened.
def connect(pid)
  sock = begin
    UNIXSocket.new(sockfile(pid))
  rescue StandardError
    nil
  end
  return unless sock

  sock.setsockopt Socket::SOL_SOCKET, Socket::SO_SNDTIMEO, SOCK_TIMEOUT_VAL
  sock
end
|
267
|
-
|
268
|
-
# Writes the whole message to a non-blocking socket.
#
# sock    - socket responding to `write_nonblock`
# msg     - data to write (converted with `to_s`)
# timeout - seconds to wait for the socket to become writable again after
#           an EAGAIN/EWOULDBLOCK
#
# Partial writes continue with the remaining bytes. Up to 10 consecutive
# zero-byte writes are tolerated before giving up; any progress resets that
# budget.
#
# Returns true when everything was written, false on timeout, too many
# empty writes, or a system call error. Errno::EINTR is re-raised.
def write(sock, msg, timeout = 5)
  msg = msg.to_s
  cnt = 10

  begin
    while true
      res = sock.write_nonblock(msg)

      if res == msg.bytesize
        return true
      elsif res > 0
        msg = msg.byteslice(res..-1)
        cnt = 10
      else
        # Give up only once the retry budget is used up. The comparison
        # used to be inverted (`0 <= (cnt -= 1)`), which bailed out on the
        # very first zero-byte write.
        if (cnt -= 1) <= 0
          t { "write failed -- max attempts" }
          return false
        end
      end
    end
  rescue Errno::EAGAIN, Errno::EWOULDBLOCK
    # Socket buffer is full: wait until it is writable, then resume with
    # whatever is left of msg
    _, socks, = IO.select([], [sock], [], timeout)
    unless socks == [sock]
      t { "write timed out" }
      return false
    end
    retry
  rescue Errno::EINTR
    raise
  rescue SystemCallError => e
    t { fmt "write failed; err=%s", e.class }
    return false
  end
end
|
302
|
-
|
303
|
-
# Spawn the worker process.
|
304
|
-
# Forks twice and execs the agent in the grandchild.
#
# f - open lockfile handle; the agent takes an exclusive lock on it and
#     writes its pid into it
#
# In the grandchild: takes the lock (exiting with status 1 when it is
# already held), removes stale socket files, records the pid, opens the
# unix domain server socket and execs the agent command through @server.
#
# Returns the result of Process.detach for the first child.
def spawn_worker(f)
  child = fork do
    # Note: By default, Ruby will finalize C objects inside the fork. Because those C objects
    # are shared with the parent, this can cause database connections to disconnect in the
    # parent process. We need to double-fork for proper semantics, so we disable the GC and
    # exit! to avoid finalizing shared handles.
    #
    # We should continue to look for alternate solutions, and to determine whether there is
    # still a possible race between the fork and the GC disabling.
    ::GC.disable
    ::Process.setsid
    exit! if fork

    # Acquire exclusive file lock, exit otherwise
    exit!(1) unless f.flock(File::LOCK_EX | File::LOCK_NB)

    f.truncate(0)

    # Lock acquired, cleanup old sock files
    Dir["#{sockfile_path}/skylight-*.sock"].each do |stale|
      File.unlink(stale) rescue nil
    end

    agent_pid = Process.pid.to_s

    # Write the pid
    f.write(agent_pid)
    f.flush

    sock_path = sockfile(agent_pid)
    File.unlink(sock_path) rescue nil

    t { fmt "opening a new socket; %s", sock_path }
    srv = UNIXServer.new(sock_path)

    # Detach the standard streams unless tracing was requested
    unless ENV[TRACE_ENV_KEY]
      null = File.open "/dev/null", File::RDWR
      [STDIN, STDOUT, STDERR].each { |stream| stream.reopen null }
    end

    # Cleanup the ENV
    ENV['RUBYOPT'] = nil

    @server.exec(SUBPROCESS_CMD, @config, f, srv, lockfile)
  end

  Process.detach(child)
end
|
356
|
-
|
357
|
-
# If the process was forked, create a new queue and restart the worker
|
358
|
-
# Recovers after the host process forked: under LOCK, records the new
# process id, discards the inherited socket and starts a fresh writer
# queue. Does nothing when the recorded pid already matches.
def handle_fork
  LOCK.synchronize do
    # Another thread may have recovered while this one waited for the lock
    next if @me == Process.pid

    trace "process forked; recovering"
    # Update the current process ID
    @me = Process.pid

    # Deal w/ the inherited socket
    if @sock
      begin
        @sock.close
      rescue StandardError
        nil
      end
    end
    @sock = nil

    @writer = build_queue
    @writer.spawn
  end
end
|
374
|
-
|
375
|
-
# Creates the lockfile directory and the sockfile directory when missing,
# then verifies that the lockfile (or its directory, when the lockfile does
# not exist yet) and the sockfile directory are writable.
#
# Raises WorkerStateError naming the path that is not writable.
def check_permissions
  lockfile_root = File.dirname(lockfile)

  [lockfile_root, sockfile_path].each { |dir| FileUtils.mkdir_p dir }

  # An existing lockfile is checked itself; otherwise its directory must
  # allow creating it
  lock_target = File.exist?(lockfile) ? lockfile : lockfile_root
  unless FileTest.writable?(lock_target)
    raise WorkerStateError, "`#{lock_target}` not writable. Please set agent.lockfile or agent.sockfile_path in your config to a writable path."
  end

  unless FileTest.writable?(sockfile_path)
    raise WorkerStateError, "`#{sockfile_path}` not writable. Please set agent.sockfile_path in your config to a writable path."
  end
end
|
395
|
-
|
396
|
-
# Builds the hello message from VERSION and SUBPROCESS_CMD.
def build_hello
  hello = Messages::Hello
  hello.build(VERSION, SUBPROCESS_CMD)
end
|
399
|
-
|
400
|
-
# Builds the writer task. It is created with the arguments (100, 1) and a
# block that hands every item to `writer_tick`.
def build_queue
  Util::Task.new(100, 1) do |item|
    writer_tick(item)
  end
end
|
403
|
-
|
404
|
-
# Reads the agent pid recorded in the lockfile.
#
# Returns the pid as an Integer, or nil when the lockfile cannot be read or
# its content is not a single run of digits (a trailing newline is
# tolerated).
def read_lockfile
  content = begin
    File.read(lockfile)
  rescue StandardError
    nil
  end

  # \A / \Z anchor the whole content. The previous ^ / $ anchors matched
  # any single numeric line, so "garbage\n17" passed and `to_i` then
  # produced 0.
  content.to_i if content && content =~ /\A\d+\Z/
end
|
410
|
-
|
411
|
-
# Path of the unix domain socket file of the agent with the given pid,
# inside `sockfile_path`.
def sockfile(pid)
  sockfile_path.to_s + "/skylight-" + pid.to_s + ".sock"
end
|
414
|
-
|
415
|
-
# True when the socket file of the agent with the given pid is present.
def sockfile?(pid)
  path = sockfile(pid)
  File.exist?(path)
end
|
418
|
-
|
419
|
-
end
|
420
|
-
end
|
421
|
-
end
|