skylight 0.3.7 → 0.3.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,4 +1,4 @@
1
1
  module Skylight
2
- VERSION = '0.3.7'
2
+ VERSION = '0.3.8'
3
3
  end
4
4
 
@@ -5,12 +5,14 @@ module Skylight
5
5
  CHUNK_SIZE = 16 * 1024
6
6
 
7
7
  # === Modules
8
- autoload :Builder, 'skylight/worker/builder'
9
- autoload :Collector, 'skylight/worker/collector'
10
- autoload :Connection, 'skylight/worker/connection'
11
- autoload :Embedded, 'skylight/worker/embedded'
12
- autoload :Server, 'skylight/worker/server'
13
- autoload :Standalone, 'skylight/worker/standalone'
8
+ autoload :Builder, 'skylight/worker/builder'
9
+ autoload :Collector, 'skylight/worker/collector'
10
+ autoload :Connection, 'skylight/worker/connection'
11
+ autoload :ConnectionSet, 'skylight/worker/connection_set'
12
+ autoload :Embedded, 'skylight/worker/embedded'
13
+ autoload :MetricsReporter, 'skylight/worker/metrics_reporter'
14
+ autoload :Server, 'skylight/worker/server'
15
+ autoload :Standalone, 'skylight/worker/standalone'
14
16
 
15
17
  end
16
18
  end
@@ -19,7 +19,7 @@ module Skylight
19
19
  case s
20
20
  when 'embedded'
21
21
  trace "building embedded worker"
22
- Embedded.new(Collector.new(config))
22
+ Embedded.new(Collector.build(config))
23
23
  when 'standalone'
24
24
  trace "building standalone worker"
25
25
 
@@ -11,23 +11,39 @@ module Skylight
11
11
 
12
12
  include Util::Logging
13
13
 
14
- attr_reader :config
14
+ attr_reader :config, :metrics_reporter
15
15
 
16
- def initialize(config)
16
+ def initialize(config, metrics_reporter = nil)
17
17
  super(1000, 0.25)
18
18
 
19
- @config = config
20
- @size = config[:'agent.sample']
21
- @batch = nil
22
- @interval = config[:'agent.interval']
23
- @refresh_at = 0
24
- @http_auth = Util::HTTP.new(config, :accounts)
19
+ @config = config
20
+ @size = config[:'agent.sample']
21
+ @batch = nil
22
+ @interval = config[:'agent.interval']
23
+ @refresh_at = 0
24
+ @http_auth = Util::HTTP.new(config, :accounts)
25
25
  @http_report = nil
26
- # @http_report = Util::HTTP.new(config, :report)
26
+ @report_meter = Metrics::Meter.new
27
+ @report_success_meter = Metrics::Meter.new
28
+ @metrics_reporter = metrics_reporter
29
+
30
+ @metrics_reporter.register("collector.report-rate", @report_meter)
31
+ @metrics_reporter.register("collector.report-success-rate", @report_success_meter)
27
32
 
28
33
  t { fmt "starting collector; interval=%d; size=%d", @interval, @size }
29
34
  end
30
35
 
36
+ def self.build(config) # Factory: returns a Collector paired with a new MetricsReporter built from the same config
37
+ new(config, MetricsReporter.new(config))
38
+ end
39
+
40
+ def prepare # Registers the queue-depth metric and calls #spawn on the reporter; does nothing when no reporter was given
41
+ if @metrics_reporter
42
+ @metrics_reporter.register("worker.collector.queue-depth", queue_depth_metric) # NOTE(review): queue_depth_metric is not defined in the visible hunks
43
+ @metrics_reporter.spawn
44
+ end
45
+ end
46
+
31
47
  def handle(msg, now = Util::Clock.absolute_secs)
32
48
  @batch ||= new_batch(now)
33
49
 
@@ -61,15 +77,23 @@ module Skylight
61
77
  true
62
78
  end
63
79
 
64
- def send_status(status)
65
- post_data(:status, status)
80
+ def send_http_exception(http, response) # Forwards response.exception to #send_exception with the request's host, port, path and method
81
+ send_exception(response.exception, additional_info: { # the braceless hash is received as the positional `data` argument
82
+ host: http.host,
83
+ port: http.port,
84
+ path: response.request.path,
85
+ method: response.request.method
86
+ })
66
87
  end
67
88
 
68
- def send_exception(exception)
69
- data = {class_name: exception.class.name}
89
+ def send_exception(exception, data={})
90
+ data = { class_name: exception.class.name,
91
+ agent_info: @metrics_reporter.build_report }.merge(data)
92
+
70
93
  if Exception === exception
71
94
  data.merge!(message: exception.message, backtrace: exception.backtrace)
72
95
  end
96
+
73
97
  post_data(:exception, data, false)
74
98
  end
75
99
 
@@ -84,7 +108,12 @@ module Skylight
84
108
  warn "#{type} wasn't sent successfully; status=%s", res.status
85
109
  end
86
110
 
87
- send_exception(res.exception) if notify && res.exception
111
+ if res.exception
112
+ send_http_exception(@http_auth, res) if notify
113
+ false
114
+ else
115
+ true
116
+ end
88
117
  rescue Exception => e
89
118
  error "exception; msg=%s; class=%s", e.message, e.class
90
119
  t { e.backtrace.join("\n") }
@@ -109,6 +138,10 @@ module Skylight
109
138
  end
110
139
 
111
140
  @batch = nil
141
+ ensure
142
+ if @metrics_reporter
143
+ @metrics_reporter.shutdown
144
+ end
112
145
  end
113
146
 
114
147
  def flush(batch)
@@ -116,8 +149,16 @@ module Skylight
116
149
 
117
150
  debug "flushing batch; size=%d", batch.sample.count
118
151
 
152
+ @report_meter.mark
153
+
119
154
  res = @http_report.post(ENDPOINT, batch.encode, CONTENT_TYPE => SKYLIGHT_V2)
120
- send_exception(res.exception) if res.exception
155
+
156
+ if res.exception
157
+ send_http_exception(@http_report, res)
158
+ else
159
+ @report_success_meter.mark
160
+ end
161
+
121
162
  nil
122
163
  end
123
164
 
@@ -125,7 +166,7 @@ module Skylight
125
166
  res = @http_auth.get("/agent/authenticate?hostname=#{escape(config[:'hostname'])}")
126
167
 
127
168
  if res.exception
128
- send_exception(res.exception)
169
+ send_http_exception(@http_auth, res)
129
170
  return
130
171
  end
131
172
 
@@ -1,14 +1,18 @@
1
1
  module Skylight
2
2
  module Worker
3
+ # Represents the IPC client connection
3
4
  class Connection
4
5
  FRAME_HDR_LEN = 8
5
6
 
6
- attr_reader :sock
7
+ attr_reader :sock, :throughput
7
8
 
8
9
  def initialize(sock)
9
10
  @sock = sock
10
11
  @len = nil
11
12
  @buf = ""
13
+
14
+ # Metrics
15
+ @throughput = Metrics::Meter.new
12
16
  end
13
17
 
14
18
  def read
@@ -17,6 +21,7 @@ module Skylight
17
21
  end
18
22
 
19
23
  if chunk = read_sock
24
+
20
25
  @buf << chunk
21
26
 
22
27
  if !@len && @buf.bytesize >= FRAME_HDR_LEN
@@ -27,6 +32,10 @@ module Skylight
27
32
  end
28
33
  end
29
34
 
35
+ def cleanup # Hook called by ConnectionSet#cleanup before the socket is closed; currently a no-op
36
+ # Any cleanup code here
37
+ end
38
+
30
39
  private
31
40
 
32
41
  def read_len
@@ -66,7 +75,10 @@ module Skylight
66
75
  end
67
76
 
68
77
  def read_sock # Non-blocking read of up to CHUNK_SIZE bytes; returns the chunk read
69
- @sock.read_nonblock(CHUNK_SIZE)
78
+ ret = @sock.read_nonblock(CHUNK_SIZE)
79
+ # Track the throughput
80
+ @throughput.mark(ret.bytesize) if ret
81
+ ret
70
82
  rescue Errno::EAGAIN, Errno::EWOULDBLOCK # no data ready: the empty rescue body makes the method return nil
71
83
  end
72
84
 
@@ -0,0 +1,56 @@
1
+ require 'thread'
2
+
3
+ module Skylight
4
+ module Worker
5
+ class ConnectionSet # Mutex-guarded map from socket to its Connection, plus metric lambdas computed over that map
6
+ attr_reader :open_connections, :throughput # both hold lambdas built in #initialize
7
+
8
+ def initialize
9
+ @connections = {} # sock => Connection
10
+ @lock = Mutex.new # every access to @connections in this class goes through this lock
11
+
12
+ # Metrics
13
+ @open_connections = build_open_connections_metric
14
+ @throughput = build_throughput_metric
15
+ end
16
+
17
+ def add(sock) # Wraps sock in a new Connection, stores it under sock, and returns the Connection
18
+ conn = Connection.new(sock)
19
+ @lock.synchronize { @connections[sock] = conn }
20
+ conn
21
+ end
22
+
23
+ def socks # Returns an array of the sockets currently registered
24
+ @lock.synchronize { @connections.keys }
25
+ end
26
+
27
+ def [](sock) # Returns the Connection stored for sock, or nil when there is none
28
+ @lock.synchronize do
29
+ @connections[sock]
30
+ end
31
+ end
32
+
33
+ def cleanup(sock) # Removes sock from the set; only if it was registered, runs Connection#cleanup and closes the socket
34
+ if conn = @lock.synchronize { @connections.delete(sock) }
35
+ conn.cleanup
36
+ sock.close rescue nil # best effort: a StandardError raised by close is discarded
37
+ end
38
+ end
39
+
40
+ private
41
+
42
+ def build_open_connections_metric # Returns a lambda that yields the number of registered connections
43
+ lambda do
44
+ @lock.synchronize { @connections.length }
45
+ end
46
+ end
47
+
48
+ def build_throughput_metric # Returns a lambda that yields one integer rate per registered connection
49
+ lambda do
50
+ conns = @lock.synchronize { @connections.values } # copy under the lock; the rates are read after releasing it
51
+ conns.map { |c| c.throughput.rate.to_i }
52
+ end
53
+ end
54
+ end
55
+ end
56
+ end
@@ -0,0 +1,103 @@
1
+ require 'thread'
2
+ require 'rbconfig'
3
+
4
+ module Skylight
5
+ module Worker
6
+ class MetricsReporter < Util::Task # Keeps a registry of named metrics and POSTs a report of them to /agent/metrics
7
+
8
+ include Util::Logging
9
+
10
+ attr_reader :config
11
+
12
+ def initialize(config)
13
+ super(1000, 0.25)
14
+
15
+ @metrics = {} # name => metric
16
+ @config = config
17
+ @interval = config[:'metrics.report_interval'] # added to `now` in #update_next_report_at; presumably seconds - confirm
18
+ @lock = Mutex.new # guards @metrics
19
+ @next_report_at = nil # stays nil until the first #handle call
20
+ @http_auth = Util::HTTP.new(config, :accounts)
21
+ end
22
+
23
+ # A metric responds to #call and returns metric info
24
+ def register(name, metric) # Stores metric under name, replacing any previous entry
25
+ @lock.synchronize { @metrics[name] = metric }
26
+ end
27
+
28
+ def unregister(name) # Removes the entry stored under name, if any
29
+ @lock.synchronize { @metrics.delete(name) }
30
+ end
31
+
32
+ # msg is always nil, but we can use the Task abstraction anyway
33
+ def handle(msg, now = Util::Clock.absolute_secs) # Always returns true; posts a report once `now` reaches @next_report_at
34
+ # First call: only schedule the first report, do not post yet
35
+ unless @next_report_at
36
+ update_next_report_at(now)
37
+ return true
38
+ end
39
+
40
+ if now < @next_report_at
41
+ # Nothing to do
42
+ return true
43
+ end
44
+
45
+ update_next_report_at(now) # reschedule before posting
46
+ post_report
47
+
48
+ true
49
+ end
50
+
51
+ def build_report # Returns a Hash of fixed host/runtime fields plus the current value of each registered metric
52
+ report = {
53
+ "hostname" => config[:'hostname'],
54
+ "host.info" => RbConfig::CONFIG['arch'],
55
+ "ruby.version" => RUBY_VERSION,
56
+ "ruby.engine" => RUBY_ENGINE,
57
+ "skylight.version" => Skylight::VERSION
58
+ }
59
+
60
+ metric_names.each do |name|
61
+ # Since we are operating in a concurrent environment, it is possible
62
+ # that the metric for the current name is unregistered before we
63
+ # access it here.
64
+ unless m = metric(name)
65
+ next
66
+ end
67
+
68
+ report[name] = m.call # invoked after the lock has been released
69
+ end
70
+
71
+ report
72
+ end
73
+
74
+ def post_report # Builds a report and POSTs it; an unsuccessful response is only logged as a warning
75
+ report = build_report
76
+
77
+ # Send the report
78
+ t { fmt "reporting internal metrics; payload=%s", report.inspect }
79
+
80
+ res = @http_auth.post("/agent/metrics", report: report)
81
+
82
+ unless res.success?
83
+ warn "internal metrics report failed; status=%s", res.status
84
+ end
85
+ end
86
+
87
+ private
88
+
89
+ def metric_names # Returns an array of the registered metric names, read under the lock
90
+ @lock.synchronize { @metrics.keys }
91
+ end
92
+
93
+ def metric(name) # Returns the metric stored under name, or nil if it is no longer registered
94
+ @lock.synchronize { @metrics[name] }
95
+ end
96
+
97
+ def update_next_report_at(now) # Schedules the next report @interval after `now`
98
+ @next_report_at = now + @interval
99
+ end
100
+
101
+ end
102
+ end
103
+ end
@@ -17,7 +17,6 @@ module Skylight
17
17
  :keepalive,
18
18
  :lockfile_path,
19
19
  :sockfile_path,
20
- :status_interval,
21
20
  :last_status_update,
22
21
  :max_memory
23
22
 
@@ -26,20 +25,23 @@ module Skylight
26
25
  raise ArgumentError, "lockfile and unix domain server socket are required"
27
26
  end
28
27
 
29
- @pid = Process.pid
30
- @run = true
31
- @tick = 1
32
- @socks = []
33
- @config = config
34
- @server = srv
35
- @lockfile = lockfile
36
- @collector = Collector.new(config)
37
- @keepalive = @config[:'agent.keepalive']
38
- @connections = {}
28
+ @pid = Process.pid
29
+ @run = true
30
+ @tick = 1
31
+ @socks = []
32
+ @config = config
33
+ @server = srv
34
+ @lockfile = lockfile
35
+ @collector = Collector.build(config)
36
+ @metrics_reporter = @collector.metrics_reporter
37
+ @keepalive = @config[:'agent.keepalive']
38
+ @connections = ConnectionSet.new
39
39
  @lockfile_path = lockfile_path
40
40
  @sockfile_path = @config[:'agent.sockfile_path']
41
- @status_interval = 60
42
- @max_memory = @config[:'agent.max_memory']
41
+ @process_mem_gauge = Metrics::ProcessMemGauge.new
42
+ @process_cpu_gauge = Metrics::ProcessCpuGauge.new
43
+ @max_memory = @config[:'agent.max_memory']
44
+ @booted_at = Util::Clock.absolute_secs
43
45
  end
44
46
 
45
47
  # Called from skylight.rb on require
@@ -122,9 +124,21 @@ module Skylight
122
124
  private
123
125
 
124
126
  def init # Logs the config, installs TERM/INT traps, registers the worker metrics, then spawns the collector
127
+ # TODO: Not super ideal to always iterate here even if debug mode isn't
128
+ # enabled, but it's not super perf critical. We will fix when we revamp
129
+ # logging
130
+ debug "initializing server; config=%s", config.to_env
131
+
125
132
  trap('TERM') { @run = false } # the main loop is `... end while @run`, so clearing @run ends it
126
133
  trap('INT') { @run = false }
127
134
 
135
+ # Register metrics
136
+ @metrics_reporter.register("worker.memory", @process_mem_gauge)
137
+ @metrics_reporter.register("worker.cpu", @process_cpu_gauge)
138
+ @metrics_reporter.register("worker.uptime", lambda { Util::Clock.absolute_secs - @booted_at }) # time elapsed since #initialize set @booted_at
139
+ @metrics_reporter.register("worker.ipc.open-connections", @connections.open_connections)
140
+ @metrics_reporter.register("worker.ipc.throughput", @connections.throughput)
141
+
128
142
  info "starting skylight daemon"
129
143
  @collector.spawn
130
144
  end
@@ -190,9 +204,8 @@ module Skylight
190
204
  sanity_check
191
205
  end
192
206
 
193
- if status_interval < now - last_status_update
194
- last_status_update = now
195
- status_check
207
+ if @process_mem_gauge.call > max_memory
208
+ raise WorkerStateError, "Memory limit exceeded: #{memory_usage} (max: #{max_memory})"
196
209
  end
197
210
  end
198
211
 
@@ -213,6 +226,9 @@ module Skylight
213
226
  end while @run
214
227
 
215
228
  true # Successful return
229
+ ensure
230
+ # Send a final metrics report
231
+ @metrics_reporter.post_report
216
232
  end
217
233
 
218
234
  # Handles an incoming message. Will be instances from
@@ -254,7 +270,7 @@ module Skylight
254
270
  def connect(sock)
255
271
  trace "client accepted"
256
272
  @socks << sock
257
- @connections[sock] = Connection.new(sock)
273
+ @connections.add(sock)
258
274
  end
259
275
 
260
276
  def cleanup
@@ -272,16 +288,15 @@ module Skylight
272
288
  end
273
289
 
274
290
  def clients_close # Closes every client socket currently registered in @connections
275
- @connections.keys.each do |sock|
291
+ @connections.socks.each do |sock|
276
292
  client_close(sock)
277
293
  end
278
294
  end
279
295
 
280
296
  def client_close(sock)
281
297
  trace "closing client connection; fd=%d", sock.fileno
282
- @connections.delete(sock)
298
+ @connections.cleanup(sock)
283
299
  @socks.delete(sock)
284
- sock.close rescue nil
285
300
  end
286
301
 
287
302
  def sockfile
@@ -315,22 +330,6 @@ module Skylight
315
330
  raise WorkerStateError, "sockfile gone"
316
331
  end
317
332
  end
318
-
319
- def status_check
320
- memory_usage = get_memory_usage
321
-
322
- @collector.send_status(memory: memory_usage, max_memory: max_memory)
323
-
324
- if memory_usage > max_memory
325
- raise WorkerStateError, "Memory limit exceeded: #{memory_usage} (max: #{max_memory})"
326
- end
327
- end
328
-
329
- def get_memory_usage
330
- `ps -o rss= -p #{Process.pid}`.to_i / 1024
331
- rescue Errno::ENOENT, Errno::EINTR
332
- 0
333
- end
334
333
  end
335
334
  end
336
335
  end