instrumental_agent 2.0.0 → 3.0.0.beta3

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
- SHA1:
3
- metadata.gz: 7fec654f621fd1e8f11f79fca789690d2e3d9b52
4
- data.tar.gz: 078bf7a001fab6ed9f982cf818e675def3aaf65f
2
+ SHA256:
3
+ metadata.gz: 20bc1dfd989a1d555912b21cd1a43b0a2ebb235e411fed6b27746cd3c01ce2ad
4
+ data.tar.gz: 7e725c0d514db6cccba3827a76a7b73995b06084a54bf0637cf7c2423d2beb3b
5
5
  SHA512:
6
- metadata.gz: 4d2b8bdfa044822f00fb9b682817d2bc1c9108e26b544b93ee218212d7ce3b41ff88c0c422de21300e35638b7490f5a5032096d469ab322d1ab3f0c369012c63
7
- data.tar.gz: 8a3494eb5e9c5932d54a7a2192d28ac181963c5d4f98567ca5cf54aa3381582eb2bed88439df664dd88a6fd4c0689c37e5ec8489e82d0fc15fffd58702cc04a8
6
+ metadata.gz: 356538412bb4aeeb4af6dbb912de961ca587b08410a48cc60b8246e7d138909dab5f75261dd441bf4fd9287868565c2a4554a313c0b9d43316ca7ac615f38b20
7
+ data.tar.gz: f819635e2b8ea0efaa7a25ed62cdf52d54b602e8b8703a7eea3b7975f83ed557ffe0cdd1a39b5dd97942492e3a872d4e343c6c179de18feb3dbb257c08faf1a8
@@ -1 +1 @@
1
- 2.0.0-p648
1
+ 2.6.3
@@ -1,8 +1,7 @@
1
1
  sudo: false
2
2
  language: ruby
3
3
  rvm:
4
- - 2.0.0-p648
5
- - 2.1.5
6
- - 2.2.3
7
- - 2.3.0
8
- - 2.4.0
4
+ - 2.5.7
5
+ - 2.6.3
6
+ - 2.6.6
7
+ - 2.7.1
@@ -1,3 +1,18 @@
1
+ ### 3.0.0.beta [October 8, 2020]
2
+ * Drop support for outdated versions of Ruby
3
+ * Explicitly add support for new versions of Ruby
4
+ * Add support for client-side aggregation
5
+ * Note: the agent API has NOT changed. This is a major release because of the significant changes in Ruby versions officially supported.
6
+
7
+ ### 3.0.0.alpha [August 22, 2019]
8
+ * Drop support for outdated versions of Ruby
9
+ * Explicitly add support for new versions of Ruby
10
+ * Better handling of SSL errors when connecting to Instrumental
11
+ * Note: the agent API has NOT changed. This is a major release because of the significant changes in Ruby versions officially supported.
12
+
13
+ ### 2.1.0 [January 19, 2018]
14
+ * Add support for capistrano 3
15
+
1
16
  ### 2.0.0 [August 21, 2017]
2
17
  * Add automatic tracking of common application metrics, official release
3
18
 
data/Gemfile CHANGED
@@ -1,11 +1,6 @@
1
1
  source "https://rubygems.org"
2
2
 
3
3
  gemspec
4
- ruby_engine = defined?(RUBY_ENGINE) && RUBY_ENGINE
5
- if RUBY_VERSION < "1.9" && !%w{jruby rbx}.include?(ruby_engine)
6
- # Built and installed via ext/mkrf_conf.rb
7
- gem 'system_timer', '~> 1.2'
8
- end
9
4
 
10
5
  # fixes 2.3.0 ffi bundle error
11
- gem 'ffi', '~> 1.0.11'
6
+ gem 'ffi', '~> 1.0.11'
data/README.md CHANGED
@@ -59,6 +59,17 @@ User.find_each do |user|
59
59
  end
60
60
  ```
61
61
 
62
+ ## Aggregation
63
+ Aggregation collects more data on your system before sending it to Instrumental. This reduces the total amount of data being sent, at the cost of a small amount of additional latency. You can control this feature with the frequency parameter:
64
+
65
+ ```ruby
66
+ I = Instrumental::Agent.new('PROJECT_API_TOKEN', :frequency => 15) # send data every 15 seconds
67
+ I.frequency = 6 # send batches of data every 6 seconds
68
+ ```
69
+
70
+ The agent may send data more frequently if you are sending a large number of different metrics. Values between 3 and 15 are generally reasonable. If you want to disable this behavior and send every metric as fast as possible, set frequency to zero or nil. Note that a frequency of zero will still use a separate thread for performance - it is NOT the same as synchronous mode.
71
+
72
+
62
73
  ## Server Metrics
63
74
 
64
75
  Want server stats like load, memory, etc.? Check out [InstrumentalD](https://github.com/instrumental/instrumentald).
@@ -109,6 +120,17 @@ I = Instrumental::Agent.new('PROJECT_API_TOKEN',
109
120
  )
110
121
  ```
111
122
 
123
+ ### Upgrading from 2.x
124
+
125
+ Agent version 3.x drops support for some older Ruby versions, but should otherwise be a drop-in replacement. If you wish to enable Aggregation, initialize the agent with the frequency option set to the number of seconds you would like to wait between flushes. For example:
126
+
127
+ ```
128
+ I = Instrumental::Agent.new('PROJECT_API_TOKEN',
129
+ :enabled => Rails.env.production?,
130
+ :frequency => 15
131
+ )
132
+ ```
133
+
112
134
  ## Troubleshooting & Help
113
135
 
114
136
  We are here to help. Email us at [support@instrumentalapp.com](mailto:support@instrumentalapp.com).
@@ -4,13 +4,13 @@ require "instrumental/version"
4
4
  Gem::Specification.new do |s|
5
5
  s.name = "instrumental_agent"
6
6
  s.version = Instrumental::VERSION
7
- s.authors = ["Elijah Miller", "Christopher Zelenak", "Kristopher Chambers", "Matthew Hassfurder"]
7
+ s.authors = ["Expected Behavior"]
8
8
  s.email = ["support@instrumentalapp.com"]
9
9
  s.homepage = "http://github.com/instrumental/instrumental_agent-ruby"
10
10
  s.summary = %q{Custom metric monitoring for Ruby applications via Instrumental}
11
11
  s.description = %q{This agent supports Instrumental custom metric monitoring for Ruby applications. It provides high-data reliability at high scale, without ever blocking your process or causing an exception.}
12
12
  s.license = "MIT"
13
- s.required_ruby_version = '>= 2.0.0'
13
+ s.required_ruby_version = '>= 2.5.7'
14
14
 
15
15
  s.files = `git ls-files`.split("\n")
16
16
  s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
@@ -1,5 +1,7 @@
1
1
  require 'instrumental/version'
2
2
  require 'instrumental/system_timer'
3
+ require 'instrumental/command_structs'
4
+ require 'instrumental/event_aggregator'
3
5
  require 'logger'
4
6
  require 'openssl' rescue nil
5
7
  require 'resolv'
@@ -15,14 +17,17 @@ module Instrumental
15
17
  EXIT_FLUSH_TIMEOUT = 5
16
18
  HOSTNAME = Socket.gethostbyname(Socket.gethostname).first rescue Socket.gethostname
17
19
  MAX_BUFFER = 5000
20
+ MAX_AGGREGATOR_SIZE = 5000
18
21
  MAX_RECONNECT_DELAY = 15
19
22
  REPLY_TIMEOUT = 10
20
23
  RESOLUTION_FAILURES_BEFORE_WAITING = 3
21
24
  RESOLUTION_WAIT = 30
22
25
  RESOLVE_TIMEOUT = 1
26
+ DEFAULT_FREQUENCY = 0
27
+ VALID_FREQUENCIES = [0, 1, 2, 3, 4, 5, 6, 10, 12, 15, 20, 30, 60]
23
28
 
24
29
 
25
- attr_accessor :host, :port, :synchronous, :queue, :dns_resolutions, :last_connect_at
30
+ attr_accessor :host, :port, :synchronous, :frequency, :sender_queue, :aggregator_queue, :dns_resolutions, :last_connect_at
26
31
  attr_reader :connection, :enabled, :secure
27
32
 
28
33
  def self.logger=(l)
@@ -52,6 +57,7 @@ module Instrumental
52
57
  # port: 8001
53
58
  # enabled: true
54
59
  # synchronous: false
60
+ # frequency: 10
55
61
  # secure: true
56
62
  # verify: true
57
63
  @api_key = api_key
@@ -73,14 +79,23 @@ module Instrumental
73
79
  @port = (@port || default_port).to_i
74
80
  @enabled = options.has_key?(:enabled) ? !!options[:enabled] : true
75
81
  @synchronous = !!options[:synchronous]
82
+
83
+ if options.has_key?(:frequency)
84
+ self.frequency = options[:frequency]
85
+ else
86
+ self.frequency = DEFAULT_FREQUENCY
87
+ end
88
+
89
+ @metrician = options[:metrician].nil? ? true : !!options[:metrician]
76
90
  @pid = Process.pid
77
91
  @allow_reconnect = true
78
- @certs = certificates
79
92
  @dns_resolutions = 0
80
93
  @last_connect_at = 0
81
- @metrician = options[:metrician].nil? ? true : !!options[:metrician]
94
+
82
95
  @start_worker_mutex = Mutex.new
83
- @queue = Queue.new
96
+ @aggregator_queue = Queue.new
97
+ @sender_queue = Queue.new
98
+
84
99
 
85
100
  setup_cleanup_at_exit if @enabled
86
101
 
@@ -94,7 +109,9 @@ module Instrumental
94
109
  # agent.gauge('load', 1.23)
95
110
  def gauge(metric, value, time = Time.now, count = 1)
96
111
  if valid?(metric, value, time, count) &&
97
- send_command("gauge", metric, value, time.to_i, count.to_i)
112
+ send_command(Instrumental::Command.new("gauge".freeze, metric, value, time, count))
113
+ # tempted to "gauge" this to a symbol? Don't. Frozen strings are very fast,
114
+ # and later we're going to to_s every one of these anyway.
98
115
  value
99
116
  else
100
117
  nil
@@ -142,7 +159,7 @@ module Instrumental
142
159
  # agent.increment('users')
143
160
  def increment(metric, value = 1, time = Time.now, count = 1)
144
161
  if valid?(metric, value, time, count) &&
145
- send_command("increment", metric, value, time.to_i, count.to_i)
162
+ send_command(Instrumental::Command.new("increment".freeze, metric, value, time, count))
146
163
  value
147
164
  else
148
165
  nil
@@ -157,7 +174,7 @@ module Instrumental
157
174
  # agent.notice('A notice')
158
175
  def notice(note, time = Time.now, duration = 0)
159
176
  if valid_note?(note)
160
- send_command("notice", time.to_i, duration.to_i, note)
177
+ send_command(Instrumental::Notice.new(note, time, duration))
161
178
  note
162
179
  else
163
180
  nil
@@ -196,6 +213,22 @@ module Instrumental
196
213
  @logger || self.class.logger
197
214
  end
198
215
 
216
+ def frequency=(frequency)
217
+ freq = frequency.to_i
218
+ if !VALID_FREQUENCIES.include?(freq)
219
+ logger.warn "Frequency must be a value that divides evenly into 60: 1, 2, 3, 4, 5, 6, 10, 12, 15, 20, 30, or 60."
220
+ # this will make all negative numbers and nils into 0s
221
+ freq = VALID_FREQUENCIES.select{ |f| f < freq }.max.to_i
222
+ end
223
+
224
+ @frequency = if(@synchronous)
225
+ logger.warn "Synchronous and Frequency should not be enabled at the same time! Defaulting to synchronous mode."
226
+ 0
227
+ else
228
+ freq
229
+ end
230
+ end
231
+
199
232
  # Stopping the agent will immediately stop all communication
200
233
  # to Instrumental. If you call this and submit another metric,
201
234
  # the agent will start again.
@@ -207,12 +240,19 @@ module Instrumental
207
240
  #
208
241
  def stop
209
242
  disconnect
210
- if @thread
211
- @thread.kill
212
- @thread = nil
243
+ if @sender_thread
244
+ @sender_thread.kill
245
+ @sender_thread = nil
246
+ end
247
+ if @aggregator_thread
248
+ @aggregator_thread.kill
249
+ @aggregator_thread = nil
213
250
  end
214
- if @queue
215
- @queue.clear
251
+ if @sender_queue
252
+ @sender_queue.clear
253
+ end
254
+ if @aggregator_queue
255
+ @aggregator_queue.clear
216
256
  end
217
257
  end
218
258
 
@@ -222,18 +262,25 @@ module Instrumental
222
262
  # where at_exit is bypassed like Resque workers.
223
263
  def cleanup
224
264
  if running?
225
- logger.info "Cleaning up agent, queue size: #{@queue.size}, thread running: #{@thread.alive?}"
265
+ logger.info "Cleaning up agent, aggregator_size: #{@aggregator_queue.size}, thread_running: #{@aggregator_thread.alive?}"
266
+ logger.info "Cleaning up agent, queue size: #{@sender_queue.size}, thread running: #{@sender_thread.alive?}"
226
267
  @allow_reconnect = false
227
- if @queue.size > 0
228
- queue_message('exit')
229
- begin
230
- with_timeout(EXIT_FLUSH_TIMEOUT) { @thread.join }
231
- rescue Timeout::Error
232
- if @queue.size > 0
233
- logger.error "Timed out working agent thread on exit, dropping #{@queue.size} metrics"
234
- else
235
- logger.error "Timed out Instrumental Agent, exiting"
236
- end
268
+ begin
269
+ with_timeout(EXIT_FLUSH_TIMEOUT) do
270
+ @aggregator_queue << ['exit']
271
+ @aggregator_thread.join
272
+ @sender_queue << ['exit']
273
+ @sender_thread.join
274
+ end
275
+ rescue Timeout::Error
276
+ total_size = @sender_queue&.size.to_i +
277
+ @aggregator_queue&.size.to_i +
278
+ @event_aggregator&.size.to_i
279
+
280
+ if total_size > 0
281
+ logger.error "Timed out working agent thread on exit, dropping #{total_size} metrics"
282
+ else
283
+ logger.error "Timed out Instrumental Agent, exiting"
237
284
  end
238
285
  end
239
286
  end
@@ -271,7 +318,8 @@ module Instrumental
271
318
  end
272
319
 
273
320
  def report_exception(e)
274
- logger.error "Exception occurred: #{e.message}\n#{e.backtrace.join("\n")}"
321
+ # puts "--- Exception of type #{e.class} occurred:\n#{e.message}\n#{e.backtrace.join("\n")}"
322
+ logger.error "Exception of type #{e.class} occurred:\n#{e.message}\n#{e.backtrace.join("\n")}"
275
323
  end
276
324
 
277
325
  def ipv4_address_for_host(host, port, moment_to_connect = Time.now.to_i)
@@ -291,44 +339,41 @@ module Instrumental
291
339
  nil
292
340
  end
293
341
 
294
- def send_command(cmd, *args)
295
- cmd = "%s %s\n" % [cmd, args.collect { |a| a.to_s }.join(" ")]
296
- if enabled?
297
-
298
- start_connection_worker
299
- if @queue && @queue.size < MAX_BUFFER
300
- @queue_full_warning = false
301
- logger.debug "Queueing: #{cmd.chomp}"
302
- queue_message(cmd, { :synchronous => @synchronous })
303
- else
304
- if !@queue_full_warning
305
- @queue_full_warning = true
306
- logger.warn "Queue full(#{@queue.size}), dropping commands..."
307
- end
308
- logger.debug "Dropping command, queue full(#{@queue.size}): #{cmd.chomp}"
309
- nil
310
- end
342
+ def send_command(command)
343
+ return logger.debug(command.to_s) unless enabled?
344
+ start_workers
345
+ critical_queue = frequency.to_i == 0 ? @sender_queue : @aggregator_queue
346
+ if critical_queue && critical_queue.size < MAX_BUFFER
347
+ @queue_full_warning = false
348
+ logger.debug "Queueing: #{command.to_s}"
349
+ queue_message(command, { :synchronous => @synchronous })
311
350
  else
312
- logger.debug cmd.strip
351
+ if !@queue_full_warning
352
+ @queue_full_warning = true
353
+ logger.warn "Queue full(#{critical_queue.size}), dropping commands..."
354
+ end
355
+ logger.debug "Dropping command, queue full(#{critical_queue.size}): #{command.to_s}"
356
+ nil
313
357
  end
314
358
  end
315
359
 
316
360
  def queue_message(message, options = {})
317
- if @enabled
318
- options ||= {}
319
- if options[:allow_reconnect].nil?
320
- options[:allow_reconnect] = @allow_reconnect
321
- end
322
- synchronous = options.delete(:synchronous)
323
- if synchronous
324
- options[:sync_resource] ||= ConditionVariable.new
325
- @sync_mutex.synchronize {
326
- @queue << [message, options]
327
- options[:sync_resource].wait(@sync_mutex)
328
- }
329
- else
330
- @queue << [message, options]
331
- end
361
+ return message unless enabled?
362
+
363
+ # imagine it's a reverse merge, but with fewer allocations
364
+ options[:allow_reconnect] = @allow_reconnect unless options.has_key?(:allow_reconnect)
365
+
366
+ if options.delete(:synchronous)
367
+ options[:sync_resource] ||= ConditionVariable.new
368
+ @sync_mutex.synchronize {
369
+ queue = message == "flush" ? @aggregator_queue : @sender_queue
370
+ queue << [message, options]
371
+ options[:sync_resource].wait(@sync_mutex)
372
+ }
373
+ elsif frequency.to_i == 0
374
+ @sender_queue << [message, options]
375
+ else
376
+ @aggregator_queue << [message, options]
332
377
  end
333
378
  message
334
379
  end
@@ -350,31 +395,15 @@ module Instrumental
350
395
 
351
396
  def test_connection
352
397
  begin
353
- # In the case where the socket is an OpenSSL::SSL::SSLSocket,
354
- # on Ruby 1.8.6, 1.8.7 or 1.9.1, read_nonblock does not exist,
355
- # and so the case of testing socket liveliness via a nonblocking
356
- # read that catches a wait condition won't work.
357
- #
358
- # We grab the SSL socket's underlying IO object and perform the
359
- # non blocking read there in order to ensure the socket is still
360
- # valid
361
- if @socket.respond_to?(:read_nonblock)
362
- @socket.read_nonblock(1)
363
- elsif @socket.respond_to?(:io)
364
- # The SSL Socket may send down additional data at close time,
365
- # so we perform two nonblocking reads, one to pull any pending
366
- # data on the socket, and the second to actually perform the connection
367
- # liveliness test
368
- @socket.io.read_nonblock(1024) && @socket.io.read_nonblock(1024)
369
- end
398
+ @socket.read_nonblock(1)
370
399
  rescue *wait_exceptions
371
400
  # noop
372
401
  end
373
402
  end
374
403
 
375
- def start_connection_worker
404
+ def start_workers
376
405
  # NOTE: We need a mutex around both `running?` and thread creation,
377
- # otherwise we could create two threads.
406
+ # otherwise we could create too many threads.
378
407
  # Return early and queue the message if another thread is
379
408
  # starting the worker.
380
409
  return if !@start_worker_mutex.try_lock
@@ -384,13 +413,34 @@ module Instrumental
384
413
  disconnect
385
414
  address = ipv4_address_for_host(@host, @port)
386
415
  if address
387
- @pid = Process.pid
416
+ new_pid = if @pid != Process.pid
417
+ @pid = Process.pid
418
+ true
419
+ else
420
+ false
421
+ end
422
+
388
423
  @sync_mutex = Mutex.new
389
424
  @failures = 0
390
425
  @sockaddr_in = Socket.pack_sockaddr_in(@port, address)
391
- logger.info "Starting thread"
392
- @thread = Thread.new do
393
- run_worker_loop
426
+
427
+ logger.info "Starting aggregator thread"
428
+ if !@aggregator_thread&.alive?
429
+ if new_pid
430
+ @event_aggregator = nil
431
+ @aggregator_queue = Queue.new
432
+ end
433
+ @aggregator_thread = Thread.new do
434
+ run_aggregator_loop
435
+ end
436
+ end
437
+
438
+ if !@sender_thread&.alive?
439
+ logger.info "Starting sender thread"
440
+ @sender_queue = Queue.new if new_pid
441
+ @sender_thread = Thread.new do
442
+ run_sender_loop
443
+ end
394
444
  end
395
445
  end
396
446
  ensure
@@ -426,82 +476,155 @@ module Instrumental
426
476
  sock
427
477
  end
428
478
 
429
- def run_worker_loop
430
- command_and_args = nil
431
- command_options = nil
432
- logger.info "connecting to collector"
433
- with_timeout(CONNECT_TIMEOUT) do
434
- @socket = open_socket(@sockaddr_in, @secure, @verify_cert)
479
+ def run_aggregator_loop
480
+ # if the sender queue is some level of full, should we keep aggregating until it empties out?
481
+ # what does this mean for aggregation slices - aggregating to nearest frequency will
482
+ # make the object needlessly larger, when minute resolution is what we have on the server
483
+ begin
484
+ loop do
485
+ now = Time.now.to_i
486
+ time_to_wait = if frequency == 0
487
+ 0
488
+ else
489
+ next_frequency = (now - (now % frequency)) + frequency
490
+ time_to_wait = [(next_frequency - Time.now.to_f), 0].max
491
+ end
492
+
493
+ command_and_args, command_options = if @event_aggregator&.size.to_i > MAX_AGGREGATOR_SIZE
494
+ logger.info "Aggregator full, flushing early with #{MAX_AGGREGATOR_SIZE} metrics."
495
+ command_and_args, command_options = ['forward', {}]
496
+ else
497
+ begin
498
+ with_timeout(time_to_wait) do
499
+ @aggregator_queue.pop
500
+ end
501
+ rescue Timeout::Error
502
+ ['forward', {}]
503
+ end
504
+ end
505
+ if command_and_args
506
+ case command_and_args
507
+ when 'exit'
508
+ if !@event_aggregator.nil?
509
+ @sender_queue << @event_aggregator
510
+ @event_aggregator = nil
511
+ end
512
+ logger.info "Exiting, #{@aggregator_queue.size} commands remain"
513
+ return true
514
+ when 'flush'
515
+ if !@event_aggregator.nil?
516
+ @sender_queue << @event_aggregator
517
+ @event_aggregator = nil
518
+ end
519
+ @sender_queue << ['flush', command_options]
520
+ when 'forward'
521
+ if !@event_aggregator.nil?
522
+ next if @sender_queue.size > 0 && @sender_queue.num_waiting < 1
523
+ @sender_queue << @event_aggregator
524
+ @event_aggregator = nil
525
+ end
526
+ when Notice
527
+ @sender_queue << [command_and_args, command_options]
528
+ else
529
+ @event_aggregator = EventAggregator.new(frequency: @frequency) if @event_aggregator.nil?
530
+
531
+ logger.debug "Sending: #{command_and_args} to aggregator"
532
+ @event_aggregator.put(command_and_args)
533
+ end
534
+ command_and_args = nil
535
+ command_options = nil
536
+ end
537
+ end
538
+ rescue Exception => err
539
+ report_exception(err)
435
540
  end
436
- logger.info "connected to collector at #{host}:#{port}"
437
- hello_options = {
438
- "version" => "ruby/instrumental_agent/#{VERSION}",
439
- "hostname" => HOSTNAME,
440
- "pid" => Process.pid,
441
- "runtime" => "#{defined?(RUBY_ENGINE) ? RUBY_ENGINE : "ruby"}/#{RUBY_VERSION}p#{RUBY_PATCHLEVEL}",
442
- "platform" => RUBY_PLATFORM
443
- }.to_a.flatten.map { |v| v.to_s.gsub(/\s+/, "_") }.join(" ")
444
-
445
- send_with_reply_timeout "hello #{hello_options}"
446
- send_with_reply_timeout "authenticate #{@api_key}"
541
+ end
542
+
543
+ def run_sender_loop
447
544
  @failures = 0
448
- loop do
449
- command_and_args, command_options = @queue.pop
450
- if command_and_args
451
- sync_resource = command_options && command_options[:sync_resource]
452
- test_connection
453
- case command_and_args
454
- when 'exit'
455
- logger.info "Exiting, #{@queue.size} commands remain"
456
- return true
457
- when 'flush'
458
- release_resource = true
459
- else
460
- logger.debug "Sending: #{command_and_args.chomp}"
461
- @socket.puts command_and_args
462
- end
463
- command_and_args = nil
464
- command_options = nil
465
- if sync_resource
466
- @sync_mutex.synchronize do
467
- sync_resource.signal
545
+ begin
546
+ logger.info "connecting to collector"
547
+ command_and_args = nil
548
+ command_options = nil
549
+ with_timeout(CONNECT_TIMEOUT) do
550
+ @socket = open_socket(@sockaddr_in, @secure, @verify_cert)
551
+ end
552
+ logger.info "connected to collector at #{host}:#{port}"
553
+ hello_options = {
554
+ "version" => "ruby/instrumental_agent/#{VERSION}",
555
+ "hostname" => HOSTNAME,
556
+ "pid" => Process.pid,
557
+ "runtime" => "#{defined?(RUBY_ENGINE) ? RUBY_ENGINE : "ruby"}/#{RUBY_VERSION}p#{RUBY_PATCHLEVEL}",
558
+ "platform" => RUBY_PLATFORM
559
+ }.to_a.flatten.map { |v| v.to_s.gsub(/\s+/, "_") }.join(" ")
560
+
561
+ send_with_reply_timeout "hello #{hello_options}"
562
+ send_with_reply_timeout "authenticate #{@api_key}"
563
+
564
+ loop do
565
+ command_and_args, command_options = @sender_queue.pop
566
+ if command_and_args
567
+ sync_resource = command_options && command_options[:sync_resource]
568
+ test_connection
569
+ case command_and_args
570
+ when 'exit'
571
+ logger.info "Exiting, #{@sender_queue.size} commands remain"
572
+ return true
573
+ when 'flush'
574
+ release_resource = true
575
+ when EventAggregator
576
+ command_and_args.values.values.each do |command|
577
+ logger.debug "Sending: #{command}"
578
+ @socket.puts command
579
+ end
580
+ else
581
+ logger.debug "Sending: #{command_and_args}"
582
+ @socket.puts command_and_args
583
+ end
584
+ command_and_args = nil
585
+ command_options = nil
586
+ if sync_resource
587
+ @sync_mutex.synchronize do
588
+ sync_resource.signal
589
+ end
468
590
  end
469
591
  end
470
592
  end
471
- end
472
- rescue Exception => err
473
- allow_reconnect = @allow_reconnect
474
- case err
475
- when EOFError
593
+ rescue Exception => err
594
+ allow_reconnect = @allow_reconnect
595
+ case err
596
+ when EOFError
476
597
  # nop
477
- when Errno::ECONNREFUSED, Errno::EHOSTUNREACH, Errno::EADDRINUSE, Timeout::Error
478
- # If the connection has been refused by Instrumental
479
- # or we cannot reach the server
480
- # or the connection state of this socket is in a race
481
- logger.error "unable to connect to Instrumental, hanging up with #{@queue.size} messages remaining"
482
- logger.debug "Exception: #{err.inspect}\n#{err.backtrace.join("\n")}"
483
- allow_reconnect = false
484
- else
485
- report_exception(err)
486
- end
487
- if allow_reconnect == false ||
488
- (command_options && command_options[:allow_reconnect] == false)
489
- logger.info "Not trying to reconnect"
490
- @failures = 0
491
- return
492
- end
493
- if command_and_args
494
- logger.debug "requeueing: #{command_and_args}"
495
- @queue << command_and_args
598
+ when Errno::ECONNREFUSED, Errno::EHOSTUNREACH, Errno::EADDRINUSE, Timeout::Error, OpenSSL::SSL::SSLError
599
+ # If the connection has been refused by Instrumental
600
+ # or we cannot reach the server
601
+ # or the connection state of this socket is in a race
602
+ # or SSL is not functioning properly for some reason
603
+ logger.error "unable to connect to Instrumental, hanging up with #{@sender_queue.size} messages remaining"
604
+ logger.debug "Exception: #{err.inspect}\n#{err.backtrace.join("\n")}"
605
+ allow_reconnect = false
606
+ else
607
+ report_exception(err)
608
+ end
609
+ if allow_reconnect == false ||
610
+ (command_options && command_options[:allow_reconnect] == false)
611
+ logger.info "Not trying to reconnect"
612
+ @failures = 0
613
+ return
614
+ end
615
+ if command_and_args
616
+ logger.debug "requeueing: #{command_and_args}"
617
+ @sender_queue << command_and_args
618
+ end
619
+ disconnect
620
+ @failures += 1
621
+ delay = [(@failures - 1) ** BACKOFF, MAX_RECONNECT_DELAY].min
622
+ logger.error "disconnected, #{@failures} failures in a row, reconnect in #{delay}..."
623
+ sleep delay
624
+ retry
625
+ ensure
626
+ disconnect
496
627
  end
497
- disconnect
498
- @failures += 1
499
- delay = [(@failures - 1) ** BACKOFF, MAX_RECONNECT_DELAY].min
500
- logger.error "disconnected, #{@failures} failures in a row, reconnect in #{delay}..."
501
- sleep delay
502
- retry
503
- ensure
504
- disconnect
505
628
  end
506
629
 
507
630
  def setup_cleanup_at_exit
@@ -511,7 +634,11 @@ module Instrumental
511
634
  end
512
635
 
513
636
  def running?
514
- !@thread.nil? && @pid == Process.pid && @thread.alive?
637
+ !@sender_thread.nil? &&
638
+ !@aggregator_thread.nil? &&
639
+ @pid == Process.pid &&
640
+ @sender_thread.alive? &&
641
+ @aggregator_thread.alive?
515
642
  end
516
643
 
517
644
  def flush_socket(socket)
@@ -541,18 +668,5 @@ module Instrumental
541
668
  def allows_secure?
542
669
  defined?(OpenSSL)
543
670
  end
544
-
545
- def certificates
546
- if allows_secure?
547
- base_dir = File.expand_path(File.join(File.dirname(__FILE__), "..", ".."))
548
- %w{equifax geotrust rapidssl}.map do |name|
549
- OpenSSL::X509::Certificate.new(File.open(File.join(base_dir, "certs", "#{name}.ca.pem")))
550
- end
551
- else
552
- []
553
- end
554
- end
555
-
556
671
  end
557
-
558
672
  end