instrumental_agent 2.0.0.alpha → 3.0.0.beta2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/.ruby-version +1 -1
- data/.travis.yml +4 -5
- data/CHANGELOG.md +18 -0
- data/Gemfile +1 -6
- data/README.md +22 -0
- data/instrumental_agent.gemspec +2 -2
- data/lib/instrumental/agent.rb +277 -163
- data/lib/instrumental/capistrano.rb +4 -46
- data/lib/instrumental/capistrano/capistrano2.rb +47 -0
- data/lib/instrumental/capistrano/capistrano3.rake +56 -0
- data/lib/instrumental/command_structs.rb +32 -0
- data/lib/instrumental/event_aggregator.rb +26 -0
- data/lib/instrumental/version.rb +1 -1
- data/spec/agent_spec.rb +436 -43
- data/spec/command_struct_specs.rb +20 -0
- data/spec/event_aggregator_spec.rb +53 -0
- data/spec/spec_helper.rb +9 -0
- metadata +34 -33
- data/certs/equifax.ca.pem +0 -69
- data/certs/geotrust.ca.pem +0 -80
- data/certs/rapidssl.ca.pem +0 -94
@@ -1,47 +1,5 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
if Capistrano::Configuration.instance
|
6
|
-
Capistrano::Configuration.instance.load do
|
7
|
-
namespace :instrumental do
|
8
|
-
namespace :util do
|
9
|
-
desc "marker for beginning of deploy"
|
10
|
-
task :deploy_start do
|
11
|
-
set :instrumental_deploy_start, Time.now
|
12
|
-
end
|
13
|
-
|
14
|
-
desc "marker for end of deploy"
|
15
|
-
task :deploy_end do
|
16
|
-
set :instrumental_deploy_end, Time.now
|
17
|
-
end
|
18
|
-
end
|
19
|
-
|
20
|
-
desc "send a notice to instrumental about the deploy"
|
21
|
-
task :record_deploy_notice do
|
22
|
-
start_at = exists?(:instrumental_deploy_start) ? instrumental_deploy_start : Time.now
|
23
|
-
end_at = exists?(:instrumental_deploy_end) ? instrumental_deploy_end : start_at
|
24
|
-
deploy_duration_in_seconds = end_at - start_at
|
25
|
-
deployer = Etc.getlogin.chomp
|
26
|
-
agent_options = { :synchronous => true }
|
27
|
-
agent_options[:collector] = instrumental_host if exists?(:instrumental_host)
|
28
|
-
agent = Instrumental::Agent.new(instrumental_key, agent_options)
|
29
|
-
message = if exists?(:deploy_message)
|
30
|
-
deploy_message
|
31
|
-
else
|
32
|
-
"#{deployer} deployed #{current_revision}"
|
33
|
-
end
|
34
|
-
agent.notice(message,
|
35
|
-
start_at,
|
36
|
-
deploy_duration_in_seconds)
|
37
|
-
logger.info("Notified Instrumental of deployment")
|
38
|
-
end
|
39
|
-
end
|
40
|
-
|
41
|
-
before "deploy", "instrumental:util:deploy_start"
|
42
|
-
after "deploy", "instrumental:util:deploy_end"
|
43
|
-
before "deploy:migrations", "instrumental:util:deploy_start"
|
44
|
-
after "deploy:migrations", "instrumental:util:deploy_end"
|
45
|
-
after "instrumental:util:deploy_end", "instrumental:record_deploy_notice"
|
46
|
-
end
|
1
|
+
# Load the integration that matches the Capistrano gem RubyGems resolves:
# version 3.0.0 and later use the rake task file, anything older uses the
# Capistrano 2 recipe.
capistrano_version = Gem::Specification.find_by_name("capistrano").version
capistrano3_or_newer = capistrano_version >= Gem::Version.new("3.0.0")

if capistrano3_or_newer
  load File.expand_path("../capistrano/capistrano3.rake", __FILE__)
else
  require_relative "capistrano/capistrano2"
end
|
@@ -0,0 +1,47 @@
|
|
1
|
+
require "etc"
|
2
|
+
require "instrumental_agent"
|
3
|
+
|
4
|
+
# Capistrano 2 recipe: times the deploy and sends a notice to Instrumental
# once it has finished.
#
# Variables read by this recipe:
#   :instrumental_hooks - when truthy, the deploy hooks below are installed
#   :instrumental_key   - token passed to Instrumental::Agent.new
#   :instrumental_host  - optional collector passed as the :collector option
#   :deployer           - name used in the default notice message
#   :deploy_message     - optional replacement for the default notice message
Capistrano::Configuration.instance.load do
  _cset(:instrumental_hooks) { true }
  _cset(:instrumental_key) { nil }
  # Etc.getlogin returns nil when no login name can be determined, so fall
  # back to the environment rather than calling chomp on nil.
  _cset(:deployer) { (Etc.getlogin || ENV["USER"] || ENV["USERNAME"]).to_s.chomp }

  if fetch(:instrumental_hooks)
    before "deploy", "instrumental:util:deploy_start"
    after "deploy", "instrumental:util:deploy_end"
    before "deploy:migrations", "instrumental:util:deploy_start"
    after "deploy:migrations", "instrumental:util:deploy_end"
    after "instrumental:util:deploy_end", "instrumental:record_deploy_notice"
  end

  namespace :instrumental do
    namespace :util do
      desc "marker for beginning of deploy"
      task :deploy_start do
        set :instrumental_deploy_start, Time.now
      end

      desc "marker for end of deploy"
      task :deploy_end do
        set :instrumental_deploy_end, Time.now
      end
    end

    desc "send a notice to instrumental about the deploy"
    task :record_deploy_notice do
      # When the markers were never set the notice covers a zero-length
      # deploy starting now.
      start_at = fetch(:instrumental_deploy_start, Time.now)
      end_at = fetch(:instrumental_deploy_end, start_at)
      deploy_duration_in_seconds = end_at - start_at
      deployer = fetch(:deployer)

      agent_options = { :synchronous => true }
      collector = fetch(:instrumental_host, false)
      agent_options[:collector] = collector if collector
      agent = Instrumental::Agent.new(fetch(:instrumental_key), agent_options)

      # Block form: the default message (and therefore current_revision) is
      # only evaluated when :deploy_message has not been set.
      message = fetch(:deploy_message) { "#{deployer} deployed #{current_revision}" }

      agent.notice(message,
                   start_at,
                   deploy_duration_in_seconds)
      logger.info("Notified Instrumental of deployment")
    end
  end
end
|
@@ -0,0 +1,56 @@
|
|
1
|
+
require "etc"
|
2
|
+
require "instrumental_agent"
|
3
|
+
|
4
|
+
# Capistrano 3 tasks: times the deploy and sends a notice to Instrumental
# once it has finished.
#
# Variables read by these tasks:
#   :instrumental_hooks - when truthy, the deploy hooks are installed
#   :instrumental_key   - token passed to Instrumental::Agent.new; no notice
#                         is sent while it is unset
#   :instrumental_host  - optional collector passed as the :collector option
#   :deployer           - name used in the default notice message
#   :deploy_message     - optional replacement for the default notice message
namespace :load do
  task :defaults do
    set :instrumental_hooks, true
    set :instrumental_key, nil
    # Etc.getlogin returns nil when no login name can be determined; this
    # default is computed eagerly, so guard against nil to keep the task
    # from raising.
    set :deployer, (Etc.getlogin || ENV["USER"] || ENV["USERNAME"]).to_s.chomp
  end
end

namespace :deploy do
  before :starting, :check_instrumental_hooks do
    invoke "instrumental:util:add_hooks" if fetch(:instrumental_hooks)
  end
end

namespace :instrumental do
  namespace :util do
    desc "add instrumental hooks to deploy"
    task :add_hooks do
      before "deploy", "instrumental:util:deploy_start"
      after "deploy", "instrumental:util:deploy_end"
      after "instrumental:util:deploy_end", "instrumental:record_deploy_notice"
    end

    desc "marker for beginning of deploy"
    task :deploy_start do
      set :instrumental_deploy_start, Time.now
    end

    desc "marker for end of deploy"
    task :deploy_end do
      set :instrumental_deploy_end, Time.now
    end
  end

  desc "send a notice to instrumental about the deploy"
  task :record_deploy_notice do
    # When the markers were never set the notice covers a zero-length deploy
    # starting now.
    start_at = fetch(:instrumental_deploy_start, Time.now)
    end_at = fetch(:instrumental_deploy_end, start_at)
    deploy_duration_in_seconds = end_at - start_at
    deployer = fetch(:deployer)

    agent_options = { :synchronous => true }
    # Settings must be read through fetch here; a bare `instrumental_host`
    # is not a defined method in this context.
    collector = fetch(:instrumental_host, false)
    agent_options[:collector] = collector if collector

    message = fetch(:deploy_message, "#{deployer} deployed #{fetch(:current_revision)}".strip)

    if fetch(:instrumental_key)
      agent = Instrumental::Agent.new(fetch(:instrumental_key), agent_options)
      agent.notice(message,
                   start_at,
                   deploy_duration_in_seconds)
      puts "Notified Instrumental of deployment"
    end
  end
end
|
@@ -0,0 +1,32 @@
|
|
1
|
+
module Instrumental
  # Command names treated as metric types.
  METRIC_TYPES = %w[increment gauge].each(&:freeze).freeze

  # One metric command. +time+ and +count+ are stored as integers
  # (converted with to_i on construction).
  Command = Struct.new(:command, :metric, :value, :time, :count) do
    def initialize(command, metric, value, time, count)
      integer_time = time.to_i
      integer_count = count.to_i
      super(command, metric, value, integer_time, integer_count)
    end

    # Space-separated rendering: "<command> <metric> <value> <time> <count>".
    def to_s
      "#{command} #{metric} #{value} #{time} #{count}"
    end

    # Frozen "<metric>:<time>" string identifying this command's metric and
    # timestamp.
    def metadata
      "#{metric}:#{time}".freeze
    end

    # Combines two commands into a new one whose value and count are the
    # sums of both; command, metric and time are taken from the receiver.
    # Adding nil returns the receiver itself.
    def +(other_command)
      if other_command.nil?
        self
      else
        summed_value = value + other_command.value
        summed_count = count + other_command.count
        Command.new(command, metric, summed_value, time, summed_count)
      end
    end
  end

  # A notice. +time+ and +duration+ are stored as integers (converted with
  # to_i on construction).
  Notice = Struct.new(:note, :time, :duration) do
    def initialize(note, time, duration)
      integer_time = time.to_i
      integer_duration = duration.to_i
      super(note, integer_time, integer_duration)
    end

    # Space-separated rendering: "notice <time> <duration> <note>".
    def to_s
      "notice #{time} #{duration} #{note}"
    end
  end
end
|
@@ -0,0 +1,26 @@
|
|
1
|
+
# Accumulates commands into one entry per metadata key, after aligning each
# command's time down to a multiple of +frequency+.
#
# Commands passed to #put must respond to #time, #time=, #metadata and #+.
class EventAggregator
  attr_accessor :counts, :values, :received_at, :frequency

  # frequency - bucket width, in the same unit as the command times.
  def initialize(frequency:)
    @values = {}
    @frequency = frequency
  end

  # Aligns the command's time to its bucket (mutating the command when the
  # time is not already aligned) and merges it into the entry stored under
  # command.metadata.
  #
  # Returns the merged entry now stored for that key.
  def put(command)
    bucket = coerce_time(command.time)
    command.time = bucket unless bucket == command.time
    key = command.metadata
    @values[key] = (command + @values[key])
  end

  # Number of distinct metadata keys currently held.
  def size
    @values.size
  end

  # Rounds +time+ (anything responding to to_i) down to a multiple of
  # +frequency+ and returns it as an Integer. A nil or zero frequency means
  # there is no bucket to align to, so the integer time is returned as is
  # instead of raising on the modulo.
  def coerce_time(time)
    itime = time.to_i
    return itime if frequency.nil? || frequency.zero?

    (itime - (itime % frequency)).to_i
  end
end
|
data/lib/instrumental/version.rb
CHANGED
data/spec/agent_spec.rb
CHANGED
@@ -9,6 +9,8 @@ def wait(n=0.2, &block)
|
|
9
9
|
if (Time.now - start) < 5
|
10
10
|
sleep n
|
11
11
|
retry
|
12
|
+
else
|
13
|
+
raise ex
|
12
14
|
end
|
13
15
|
end
|
14
16
|
else
|
@@ -37,7 +39,8 @@ shared_examples "Instrumental Agent" do
|
|
37
39
|
let(:token) { 'test_token' }
|
38
40
|
let(:address) { server.host_and_port }
|
39
41
|
let(:metrician) { false }
|
40
|
-
let(:
|
42
|
+
let(:frequency) { 0 }
|
43
|
+
let(:agent) { Instrumental::Agent.new(token, :collector => address, :synchronous => synchronous, :enabled => enabled, :secure => secure?, :verify_cert => verify_cert?, :metrician => metrician, :frequency => frequency) }
|
41
44
|
|
42
45
|
# Server options
|
43
46
|
let(:listen) { true }
|
@@ -45,6 +48,12 @@ shared_examples "Instrumental Agent" do
|
|
45
48
|
let(:authenticate) { true }
|
46
49
|
let(:server) { TestServer.new(:listen => listen, :authenticate => authenticate, :response => response, :secure => secure?) }
|
47
50
|
|
51
|
+
# Time Travel Options
|
52
|
+
let(:start_of_minute) do
|
53
|
+
now = Time.now.to_i
|
54
|
+
Time.at(now - (now % 60))
|
55
|
+
end
|
56
|
+
|
48
57
|
before do
|
49
58
|
Instrumental::Agent.logger.level = Logger::UNKNOWN
|
50
59
|
@server = server
|
@@ -226,16 +235,16 @@ shared_examples "Instrumental Agent" do
|
|
226
235
|
allow(agent.logger).to receive(:debug)
|
227
236
|
expect(agent.logger).to receive(:debug).with("Dropping command, queue full(3): increment overflow_test 4 300 1")
|
228
237
|
expect(agent.logger).to receive(:debug).with("Dropping command, queue full(3): increment overflow_test 5 300 1")
|
229
|
-
5
|
230
|
-
agent.increment('overflow_test', i
|
231
|
-
end
|
232
|
-
wait do
|
233
|
-
expect(server.commands).to include("increment overflow_test 1 300 1")
|
234
|
-
expect(server.commands).to include("increment overflow_test 2 300 1")
|
235
|
-
expect(server.commands).to include("increment overflow_test 3 300 1")
|
236
|
-
expect(server.commands).to_not include("increment overflow_test 4 300 1")
|
237
|
-
expect(server.commands).to_not include("increment overflow_test 5 300 1")
|
238
|
+
1.upto(5) do |i|
|
239
|
+
agent.increment('overflow_test', i, 300)
|
238
240
|
end
|
241
|
+
|
242
|
+
wait
|
243
|
+
expect(agent.sender_queue.size).to eq(3)
|
244
|
+
expect(agent.sender_queue.pop.first.to_s).to start_with("increment overflow_test 1 300 1")
|
245
|
+
expect(agent.sender_queue.pop.first.to_s).to start_with("increment overflow_test 2 300 1")
|
246
|
+
expect(agent.sender_queue.pop.first.to_s).to start_with("increment overflow_test 3 300 1")
|
247
|
+
expect(agent.sender_queue.size).to eq(0)
|
239
248
|
end
|
240
249
|
end
|
241
250
|
end
|
@@ -246,7 +255,7 @@ shared_examples "Instrumental Agent" do
|
|
246
255
|
5.times do |i|
|
247
256
|
agent.increment('overflow_test', i + 1, 300)
|
248
257
|
end
|
249
|
-
expect(agent.instance_variable_get(:@
|
258
|
+
expect(agent.instance_variable_get(:@sender_queue).size).to eq(0)
|
250
259
|
wait # let the server receive the commands
|
251
260
|
expect(server.commands).to include("increment overflow_test 1 300 1")
|
252
261
|
expect(server.commands).to include("increment overflow_test 2 300 1")
|
@@ -262,8 +271,10 @@ shared_examples "Instrumental Agent" do
|
|
262
271
|
fork do
|
263
272
|
agent.increment('fork_reconnect_test', 1, 3) # triggers reconnect
|
264
273
|
end
|
274
|
+
|
265
275
|
wait(1)
|
266
276
|
agent.increment('fork_reconnect_test', 1, 4) # triggers reconnect
|
277
|
+
|
267
278
|
wait(1)
|
268
279
|
expect(server.connect_count).to eq(2)
|
269
280
|
|
@@ -279,17 +290,17 @@ shared_examples "Instrumental Agent" do
|
|
279
290
|
sleep 1
|
280
291
|
}
|
281
292
|
|
282
|
-
|
283
|
-
allow(agent).to receive(:
|
284
|
-
|
293
|
+
run_sender_loop_calls = 0
|
294
|
+
allow(agent).to receive(:run_sender_loop) {
|
295
|
+
run_sender_loop_calls += 1
|
285
296
|
sleep 3 # keep the worker thread alive
|
286
297
|
}
|
287
298
|
|
288
299
|
t = Thread.new { agent.increment("race") }
|
289
300
|
agent.increment("race")
|
290
301
|
wait(2)
|
291
|
-
expect(
|
292
|
-
expect(agent.
|
302
|
+
expect(run_sender_loop_calls).to eq(1)
|
303
|
+
expect(agent.sender_queue.size).to eq(2)
|
293
304
|
end
|
294
305
|
|
295
306
|
it "should never let an exception reach the user" do
|
@@ -312,14 +323,6 @@ shared_examples "Instrumental Agent" do
|
|
312
323
|
expect(agent.increment("test")).to eq(nil)
|
313
324
|
end
|
314
325
|
|
315
|
-
it "should track invalid metrics" do
|
316
|
-
expect(agent.logger).to receive(:warn).with(/%%/)
|
317
|
-
agent.increment(' %% .!#@$%^&*', 1, 1)
|
318
|
-
wait do
|
319
|
-
expect(server.commands.join("\n")).to include("increment agent.invalid_metric")
|
320
|
-
end
|
321
|
-
end
|
322
|
-
|
323
326
|
it "should allow reasonable metric names" do
|
324
327
|
agent.increment('a')
|
325
328
|
agent.increment('a.b')
|
@@ -397,9 +400,9 @@ shared_examples "Instrumental Agent" do
|
|
397
400
|
|
398
401
|
it "should allow flushing pending values to the server" do
|
399
402
|
1.upto(100) { agent.gauge('a', rand(50)) }
|
400
|
-
expect(agent.instance_variable_get(:@
|
403
|
+
expect(agent.instance_variable_get(:@sender_queue).size).to be > 0
|
401
404
|
agent.flush
|
402
|
-
expect(agent.instance_variable_get(:@
|
405
|
+
expect(agent.instance_variable_get(:@sender_queue).size).to eq(0)
|
403
406
|
wait do
|
404
407
|
expect(server.commands.grep(/^gauge a /).size).to eq(100)
|
405
408
|
end
|
@@ -437,7 +440,7 @@ shared_examples "Instrumental Agent" do
|
|
437
440
|
agent.increment('reconnect_test', 1, 1234)
|
438
441
|
wait
|
439
442
|
# The agent should not have sent the metric yet, the server is not responding
|
440
|
-
expect(agent.
|
443
|
+
expect(agent.sender_queue.pop(true).first.to_s).to eq("increment reconnect_test 1 1234 1")
|
441
444
|
end
|
442
445
|
|
443
446
|
it "should warn once when buffer is full" do
|
@@ -472,14 +475,14 @@ shared_examples "Instrumental Agent" do
|
|
472
475
|
agent.increment('reconnect_test', 1, 1234)
|
473
476
|
wait
|
474
477
|
# Since server hasn't responded to hello or authenticate, worker thread will not send data
|
475
|
-
expect(agent.
|
478
|
+
expect(agent.sender_queue.pop(true).first.to_s).to eq("increment reconnect_test 1 1234 1")
|
476
479
|
end
|
477
480
|
end
|
478
481
|
|
479
482
|
context 'server hangup' do
|
480
483
|
it "should cancel the worker thread when the host has hung up" do
|
481
484
|
# Start the background agent thread and let it send one metric successfully
|
482
|
-
agent.gauge('
|
485
|
+
agent.gauge('connection_failure1', 1, 1234)
|
483
486
|
wait do
|
484
487
|
expect(server.commands.grep(/connection_failure/).size).to eq(1)
|
485
488
|
end
|
@@ -487,13 +490,13 @@ shared_examples "Instrumental Agent" do
|
|
487
490
|
server.stop
|
488
491
|
wait
|
489
492
|
# Send one metric to the stopped server
|
490
|
-
agent.gauge('
|
493
|
+
agent.gauge('connection_failure2', 1, 1234)
|
491
494
|
# The agent thread should have stopped running since the network write would
|
492
495
|
# have failed. The queue will still contain the metric that has yet to be sent
|
493
496
|
wait do
|
494
497
|
expect(agent.send(:running?)).to eq(false)
|
495
498
|
end
|
496
|
-
expect(agent.
|
499
|
+
expect(agent.sender_queue.size).to eq(1)
|
497
500
|
end
|
498
501
|
|
499
502
|
it "should restart the worker thread after hanging it up during an unreachable host event" do
|
@@ -512,7 +515,7 @@ shared_examples "Instrumental Agent" do
|
|
512
515
|
wait do
|
513
516
|
expect(agent.send(:running?)).to eq(false)
|
514
517
|
end
|
515
|
-
expect(agent.
|
518
|
+
expect(agent.sender_queue.size).to eq(1)
|
516
519
|
# Start the server back up again
|
517
520
|
server.listen
|
518
521
|
# Sending another metric should kickstart the background worker thread
|
@@ -520,12 +523,76 @@ shared_examples "Instrumental Agent" do
|
|
520
523
|
# The agent should now be running the background thread, and the queue should be empty
|
521
524
|
wait do
|
522
525
|
expect(agent.send(:running?)).to eq(true)
|
523
|
-
expect(agent.
|
526
|
+
expect(agent.sender_queue.size).to eq(0)
|
524
527
|
end
|
525
528
|
end
|
526
529
|
|
527
|
-
|
530
|
+
it "should restart the worker thread after hanging it up during a bad ssl handshake event" do
|
531
|
+
# Start the background agent thread and let it send one metric successfully
|
532
|
+
agent.gauge('connection_failure', 1, 1234)
|
533
|
+
wait do
|
534
|
+
expect(server.commands.grep(/connection_failure/).size).to eq(1)
|
535
|
+
end
|
536
|
+
# Make the agent return the relevant exception on the next connection test
|
537
|
+
test_connection_fail = true
|
538
|
+
tc = agent.method(:test_connection)
|
539
|
+
allow(agent).to receive(:test_connection) do |*args, &block|
|
540
|
+
test_connection_fail ? raise(OpenSSL::SSL::SSLError.new) : tc.call(*args)
|
541
|
+
end
|
528
542
|
|
543
|
+
# Send one metric to the agent
|
544
|
+
agent.gauge('connection_failure', 1, 1234)
|
545
|
+
# The agent thread should have stopped running since the network write would
|
546
|
+
# have failed.
|
547
|
+
wait do
|
548
|
+
expect(agent.send(:running?)).to eq(false)
|
549
|
+
end
|
550
|
+
# The command is not in the queue
|
551
|
+
expect(agent.sender_queue.size).to eq(0)
|
552
|
+
# allow the agent to behave normally
|
553
|
+
test_connection_fail = false
|
554
|
+
# Sending another metric should kickstart the background worker thread
|
555
|
+
agent.gauge('connection_failure', 1, 1234)
|
556
|
+
# The agent should now be running the background thread, and the queue should be empty
|
557
|
+
wait do
|
558
|
+
expect(agent.send(:running?)).to eq(true)
|
559
|
+
expect(agent.sender_queue.size).to eq(0)
|
560
|
+
expect(server.commands.grep(/connection_failure/).size).to eq(2)
|
561
|
+
end
|
562
|
+
end
|
563
|
+
|
564
|
+
it "should accurately count failures so that backoff can work as intended" do
|
565
|
+
# Start the background agent thread and let it send one metric successfully
|
566
|
+
agent.gauge('connection_failure', 1, 1234)
|
567
|
+
wait do
|
568
|
+
expect(server.commands.grep(/connection_failure/).size).to eq(1)
|
569
|
+
end
|
570
|
+
|
571
|
+
# configure test_connection to fail in a way that won't kill the inner loop
|
572
|
+
test_connection_fail = true
|
573
|
+
tc = agent.method(:test_connection)
|
574
|
+
allow(agent).to receive(:test_connection) do |*args, &block|
|
575
|
+
test_connection_fail ? raise("test_connection_fail") : tc.call(*args)
|
576
|
+
end
|
577
|
+
|
578
|
+
# send some metrics
|
579
|
+
agent.gauge('connection_failure_1', 1, 1234)
|
580
|
+
agent.gauge('connection_failure_2', 1, 1234)
|
581
|
+
agent.gauge('connection_failure_3', 1, 1234)
|
582
|
+
wait do
|
583
|
+
expect(agent.instance_variable_get(:@failures)).to be > 0
|
584
|
+
expect(agent.sender_queue.size).to be > 0
|
585
|
+
end
|
586
|
+
|
587
|
+
# let the loop proceed
|
588
|
+
test_connection_fail = false
|
589
|
+
|
590
|
+
wait do
|
591
|
+
expect(agent.send(:running?)).to eq(true)
|
592
|
+
expect(agent.sender_queue.size).to eq(0)
|
593
|
+
end
|
594
|
+
end
|
595
|
+
end
|
529
596
|
|
530
597
|
context 'not authenticating' do
|
531
598
|
# Server will fail all authentication attempts
|
@@ -535,7 +602,7 @@ shared_examples "Instrumental Agent" do
|
|
535
602
|
agent.increment('reconnect_test', 1, 1234)
|
536
603
|
wait
|
537
604
|
# Metrics should not have been sent since all authentication failed
|
538
|
-
expect(agent.
|
605
|
+
expect(agent.sender_queue.pop(true).first.to_s).to eq("increment reconnect_test 1 1234 1")
|
539
606
|
end
|
540
607
|
end
|
541
608
|
|
@@ -569,20 +636,21 @@ shared_examples "Instrumental Agent" do
|
|
569
636
|
end
|
570
637
|
end
|
571
638
|
|
572
|
-
it "should
|
639
|
+
it "should follow normal exit procedures whether or not there are commands queued" do
|
573
640
|
allow(agent).to receive(:open_socket) { |*args, &block| sleep(5) && block.call }
|
574
|
-
with_constants('Instrumental::Agent::EXIT_FLUSH_TIMEOUT' =>
|
575
|
-
if (pid = fork { agent.increment('foo', 1); agent.
|
641
|
+
with_constants('Instrumental::Agent::EXIT_FLUSH_TIMEOUT' => 1) do
|
642
|
+
if (pid = fork { agent.increment('foo', 1); agent.sender_queue.clear })
|
576
643
|
tm = Time.now.to_f
|
577
644
|
Process.wait(pid)
|
578
645
|
diff = Time.now.to_f - tm
|
579
|
-
expect(diff).to be <
|
646
|
+
expect(diff).to be < 2
|
647
|
+
expect(diff).to be > 1
|
580
648
|
end
|
581
649
|
end
|
582
650
|
end
|
583
651
|
end
|
584
652
|
|
585
|
-
it "should not wait longer than EXIT_FLUSH_TIMEOUT to attempt flushing the socket when disconnecting" do
|
653
|
+
it "should not wait much longer than EXIT_FLUSH_TIMEOUT to attempt flushing the socket when disconnecting" do
|
586
654
|
agent.increment('foo', 1)
|
587
655
|
wait do
|
588
656
|
expect(server.commands.grep(/foo/).size).to eq(1)
|
@@ -598,12 +666,13 @@ shared_examples "Instrumental Agent" do
|
|
598
666
|
raise
|
599
667
|
end
|
600
668
|
end.join
|
601
|
-
end
|
669
|
+
end.at_least(1).times
|
670
|
+
|
602
671
|
with_constants('Instrumental::Agent::EXIT_FLUSH_TIMEOUT' => 3) do
|
603
672
|
tm = Time.now.to_f
|
604
673
|
agent.cleanup
|
605
674
|
diff = Time.now.to_f - tm
|
606
|
-
expect(diff).to be <= 3
|
675
|
+
expect(diff).to be <= 3.1
|
607
676
|
end
|
608
677
|
end
|
609
678
|
|
@@ -658,7 +727,7 @@ shared_examples "Instrumental Agent" do
|
|
658
727
|
expect(agent.send(:running?)).to eq(true)
|
659
728
|
|
660
729
|
# Setup a failure for the next command so we'll break out of the inner
|
661
|
-
# loop in
|
730
|
+
# loop in run_sender_loop causing another call to open_socket
|
662
731
|
test_connection_fail = true
|
663
732
|
tc = agent.method(:test_connection)
|
664
733
|
allow(agent).to receive(:test_connection) { |*args, &block| test_connection_fail ? raise("fail") : tc.call(*args) }
|
@@ -735,6 +804,330 @@ shared_examples "Instrumental Agent" do
|
|
735
804
|
end
|
736
805
|
end
|
737
806
|
end
|
807
|
+
|
808
|
+
describe Instrumental::Agent, "aggregation" do
|
809
|
+
context "aggregation enabled" do
|
810
|
+
let(:frequency) { 2 }
|
811
|
+
|
812
|
+
it "can be enabled at Agent.new time" do
|
813
|
+
expect(agent.frequency).to eq(2)
|
814
|
+
end
|
815
|
+
|
816
|
+
it "can be modified by setting the agent frequency" do
|
817
|
+
agent.frequency = 15
|
818
|
+
expect(agent.frequency).to eq(15)
|
819
|
+
end
|
820
|
+
|
821
|
+
it "is disabled by default" do
|
822
|
+
agent = Instrumental::Agent.new('test_token')
|
823
|
+
expect(agent.frequency.to_f).to eq(0)
|
824
|
+
end
|
825
|
+
|
826
|
+
it "should only allow frequencies that align with minutes" do
|
827
|
+
(-5..100).each do |freq|
|
828
|
+
agent.frequency = freq
|
829
|
+
expect(Instrumental::Agent::VALID_FREQUENCIES).to include(agent.frequency)
|
830
|
+
end
|
831
|
+
end
|
832
|
+
|
833
|
+
it "bypasses aggregator queue entirely for most commands when frequency == 0" do
|
834
|
+
agent.frequency = 0 # this is red - 0 for green
|
835
|
+
expect(EventAggregator).not_to receive(:new)
|
836
|
+
agent.increment('a_metric')
|
837
|
+
end
|
838
|
+
|
839
|
+
it "adds data to the event aggregator and does not immediately send it" do
|
840
|
+
Timecop.travel start_of_minute
|
841
|
+
agent.increment('test')
|
842
|
+
wait do
|
843
|
+
expect(agent.instance_variable_get(:@event_aggregator).size).to eq(1)
|
844
|
+
expect(agent.instance_variable_get(:@event_aggregator).values.values.first.metric).to eq('test')
|
845
|
+
end
|
846
|
+
end
|
847
|
+
|
848
|
+
it "batches data before sending" do
|
849
|
+
Timecop.freeze do
|
850
|
+
agent.increment('a_metric')
|
851
|
+
agent.increment('a_metric')
|
852
|
+
agent.increment('another_metric')
|
853
|
+
end
|
854
|
+
agent.flush(true)
|
855
|
+
wait do
|
856
|
+
expect(server.commands.grep(/_metric/).size).to eq(2)
|
857
|
+
aggregated_metric = server.commands.grep(/a_metric/).first.split(" ")
|
858
|
+
expect(aggregated_metric[2].to_i).to eq(2) # value
|
859
|
+
expect(aggregated_metric[4].to_i).to eq(2) # count
|
860
|
+
end
|
861
|
+
end
|
862
|
+
|
863
|
+
it "aggregates to the specified frequency within the aggregator" do
|
864
|
+
Timecop.travel(start_of_minute)
|
865
|
+
agent.frequency = 15
|
866
|
+
expect(agent.frequency).not_to be(Instrumental::Agent::DEFAULT_FREQUENCY)
|
867
|
+
agent.increment('metric', 1, Time.at(0))
|
868
|
+
|
869
|
+
# will get aligned to the closest frequency (15)
|
870
|
+
agent.increment('metric', 1, Time.at(20))
|
871
|
+
wait do
|
872
|
+
expect(agent.instance_variable_get(:@event_aggregator).values.keys).to eq(["metric:0", "metric:15"])
|
873
|
+
end
|
874
|
+
agent.flush
|
875
|
+
wait do
|
876
|
+
expect(server.commands.grep(/metric 1 0/).size).to eq(1)
|
877
|
+
expect(server.commands.grep(/metric 1 15/).size).to eq(1)
|
878
|
+
end
|
879
|
+
end
|
880
|
+
|
881
|
+
it "flushes data from both queues before sending" do
|
882
|
+
Timecop.freeze do
|
883
|
+
100.times do |i|
|
884
|
+
agent.increment("test_metric_#{i}")
|
885
|
+
agent.increment("other_metric")
|
886
|
+
end
|
887
|
+
end
|
888
|
+
|
889
|
+
expect(agent.instance_variable_get(:@aggregator_queue).size).to be > 0
|
890
|
+
agent.flush
|
891
|
+
expect(agent.instance_variable_get(:@sender_queue).size).to eq(0)
|
892
|
+
expect(agent.instance_variable_get(:@aggregator_queue).size).to eq(0)
|
893
|
+
|
894
|
+
wait do
|
895
|
+
expect(server.commands.grep(/test_metric/).size).to eq(100)
|
896
|
+
expect(server.commands.grep(/other_metric/).size).to eq(1)
|
897
|
+
end
|
898
|
+
end
|
899
|
+
|
900
|
+
it "does not batch notices" do
|
901
|
+
agent.frequency = 60
|
902
|
+
agent.notice "things are happening", 0, 100
|
903
|
+
agent.notice "things are happening", 0, 100
|
904
|
+
agent.notice "things are happening", 0, 100
|
905
|
+
wait do
|
906
|
+
expect(server.commands.grep(/things are happening/).size).to eq(3)
|
907
|
+
end
|
908
|
+
end
|
909
|
+
|
910
|
+
it "can be disabled by setting frequency to nil" do
|
911
|
+
agent.frequency = nil
|
912
|
+
expect(EventAggregator).not_to receive(:new)
|
913
|
+
agent.increment('metric')
|
914
|
+
wait do
|
915
|
+
expect(server.commands.grep(/metric/).size).to eq(1)
|
916
|
+
end
|
917
|
+
end
|
918
|
+
|
919
|
+
it "can be disabled by setting frequency to 0" do
|
920
|
+
agent.frequency = 0
|
921
|
+
expect(EventAggregator).not_to receive(:new)
|
922
|
+
agent.increment('metric')
|
923
|
+
wait do
|
924
|
+
expect(server.commands.grep(/metric/).size).to eq(1)
|
925
|
+
end
|
926
|
+
end
|
927
|
+
|
928
|
+
it "automatically uses the highest-without-going-over frequency for a bad frequency" do
|
929
|
+
agent.frequency = 17
|
930
|
+
expect(agent.frequency).to eq(15)
|
931
|
+
agent.frequency = 69420
|
932
|
+
expect(agent.frequency).to eq(60)
|
933
|
+
agent.frequency = 0
|
934
|
+
expect(agent.frequency).to eq(0)
|
935
|
+
agent.frequency = -1
|
936
|
+
expect(agent.frequency).to eq(0)
|
937
|
+
end
|
938
|
+
|
939
|
+
it "can take strings as frequency" do
|
940
|
+
agent = Instrumental::Agent.new('test_token', :frequency => "15")
|
941
|
+
expect(agent.frequency).to eq(15)
|
942
|
+
end
|
943
|
+
|
944
|
+
it "should not be enabled at the same time as synchronous" do
|
945
|
+
expect(Instrumental::Agent.logger).to receive(:warn).with(/Synchronous and Frequency should not be enabled at the same time! Defaulting to synchronous mode./)
|
946
|
+
agent = Instrumental::Agent.new('test_token', :synchronous => true, :frequency => 6)
|
947
|
+
expect(agent.synchronous).to eq(true)
|
948
|
+
expect(agent.frequency).to eq(0)
|
949
|
+
end
|
950
|
+
|
951
|
+
it "should use synchronous mode if it is enabled, even if turned on after frequency set at start" do
|
952
|
+
agent.increment('metric')
|
953
|
+
agent.increment('metric')
|
954
|
+
agent.synchronous = true
|
955
|
+
agent.increment('metric')
|
956
|
+
wait do
|
957
|
+
expect(server.commands.grep(/metric 1/).size).to eq(1)
|
958
|
+
end
|
959
|
+
agent.flush
|
960
|
+
wait do
|
961
|
+
expect(server.commands.grep(/metric 1/).size).to eq(1)
|
962
|
+
expect(server.commands.grep(/metric 2/).size).to eq(1)
|
963
|
+
end
|
964
|
+
end
|
965
|
+
|
966
|
+
it "sends aggregated metrics after specified frequency, even if no flush is sent" do
|
967
|
+
agent.frequency = 1
|
968
|
+
Timecop.travel(start_of_minute)
|
969
|
+
agent.increment('metric')
|
970
|
+
agent.increment('metric')
|
971
|
+
agent.gauge('other', 1)
|
972
|
+
agent.gauge('other', 1)
|
973
|
+
agent.gauge('other', 1)
|
974
|
+
sleep (0.5)
|
975
|
+
wait { expect(server.commands.grep(/metric/).size).to eq(0) }
|
976
|
+
sleep (0.51) # total sleep > 1 frequency
|
977
|
+
|
978
|
+
expect(server.commands.grep(/metric 2/).size).to eq(1)
|
979
|
+
expect(server.commands.grep(/other 3/).size).to eq(1)
|
980
|
+
end
|
981
|
+
|
982
|
+
# this test really relies on the worker threads not working unexpectedly
|
983
|
+
it "will overflow if the aggregator queue is full" do
|
984
|
+
Timecop.travel(start_of_minute)
|
985
|
+
with_constants('Instrumental::Agent::MAX_BUFFER' => 3) do
|
986
|
+
allow(agent.logger).to receive(:debug)
|
987
|
+
expect(agent.logger).to receive(:debug).with("Dropping command, queue full(3): increment overflow_test 4 300 1")
|
988
|
+
agent.increment('overflow_test', 4, 300, 1)
|
989
|
+
agent.increment('overflow_test', 4, 300, 1)
|
990
|
+
agent.increment('overflow_test', 4, 300, 1)
|
991
|
+
agent.increment('overflow_test', 4, 300, 1)
|
992
|
+
|
993
|
+
expect(agent.instance_variable_get(:@aggregator_queue).size).to eq(3)
|
994
|
+
agent.flush
|
995
|
+
expect(agent.instance_variable_get(:@aggregator_queue).size).to eq(0)
|
996
|
+
end
|
997
|
+
end
|
998
|
+
|
999
|
+
it "if aggregator is at max size, next command will force a forward to the sender thread" do
|
1000
|
+
Timecop.travel(start_of_minute)
|
1001
|
+
with_constants('Instrumental::Agent::MAX_AGGREGATOR_SIZE' => 3) do
|
1002
|
+
agent.increment('overflow_test1')
|
1003
|
+
agent.increment('overflow_test2')
|
1004
|
+
agent.increment('overflow_test3')
|
1005
|
+
agent.increment('overflow_test4')
|
1006
|
+
agent.increment('overflow_test5')
|
1007
|
+
|
1008
|
+
# only 1 because the 5th command triggers a forward of the first 4
|
1009
|
+
wait do
|
1010
|
+
expect(agent.instance_variable_get(:@event_aggregator).size).to eq(1)
|
1011
|
+
end
|
1012
|
+
agent.flush
|
1013
|
+
wait do
|
1014
|
+
expect(server.commands.grep(/overflow_test/).size).to eq(5)
|
1015
|
+
end
|
1016
|
+
end
|
1017
|
+
end
|
1018
|
+
|
1019
|
+
context do
|
1020
|
+
let(:listen) { false }
|
1021
|
+
it "will not send aggregators to the sender queue if the sender thread is not ready" do
|
1022
|
+
Timecop.travel(start_of_minute)
|
1023
|
+
agent.frequency = 1
|
1024
|
+
|
1025
|
+
with_constants('Instrumental::Agent::MAX_BUFFER' => 3,
|
1026
|
+
'Instrumental::Agent::MAX_AGGREGATOR_SIZE' => 4) do
|
1027
|
+
|
1028
|
+
# fill the queue
|
1029
|
+
agent.increment('overflow_test1')
|
1030
|
+
agent.increment('overflow_test2')
|
1031
|
+
agent.increment('overflow_test3')
|
1032
|
+
|
1033
|
+
# wait until they are all in the aggregator
|
1034
|
+
wait do
|
1035
|
+
expect(agent.instance_variable_get(:@aggregator_queue).size).to eq(0)
|
1036
|
+
expect(agent.instance_variable_get(:@event_aggregator).size).to eq(3)
|
1037
|
+
expect(agent.instance_variable_get(:@sender_queue).size).to eq(0)
|
1038
|
+
end
|
1039
|
+
|
1040
|
+
# fill the queue again
|
1041
|
+
agent.increment('overflow_test1')
|
1042
|
+
agent.increment('overflow_test2')
|
1043
|
+
agent.increment('overflow_test3')
|
1044
|
+
|
1045
|
+
# wait until they are all in the aggregator
|
1046
|
+
wait do
|
1047
|
+
expect(agent.instance_variable_get(:@aggregator_queue).size).to eq(0)
|
1048
|
+
expect(agent.instance_variable_get(:@event_aggregator).size).to eq(3)
|
1049
|
+
expect(agent.instance_variable_get(:@sender_queue).size).to eq(0)
|
1050
|
+
end
|
1051
|
+
|
1052
|
+
# wait for the aggregator to get forwarded and popped by the sender
|
1053
|
+
wait do
|
1054
|
+
expect(agent.instance_variable_get(:@aggregator_queue).size).to eq(0)
|
1055
|
+
expect(agent.instance_variable_get(:@event_aggregator)).to eq(nil)
|
1056
|
+
expect(agent.instance_variable_get(:@sender_queue).size).to eq(1)
|
1057
|
+
end
|
1058
|
+
|
1059
|
+
# fill the queue again
|
1060
|
+
agent.increment('overflow_test4')
|
1061
|
+
agent.increment('overflow_test5')
|
1062
|
+
agent.increment('overflow_test6')
|
1063
|
+
|
1064
|
+
# wait for them all to be in the aggregator
|
1065
|
+
wait do
|
1066
|
+
expect(agent.instance_variable_get(:@aggregator_queue).size).to eq(0)
|
1067
|
+
expect(agent.instance_variable_get(:@event_aggregator).size).to eq(3)
|
1068
|
+
expect(agent.instance_variable_get(:@sender_queue).size).to eq(1)
|
1069
|
+
end
|
1070
|
+
|
1071
|
+
# sleep until the next forward is done
|
1072
|
+
sleep(agent.frequency + 0.1)
|
1073
|
+
|
1074
|
+
# fill the queue again
|
1075
|
+
agent.increment('overflow_test7')
|
1076
|
+
agent.increment('overflow_test8')
|
1077
|
+
agent.increment('overflow_test9')
|
1078
|
+
|
1079
|
+
# because sending is blocked, the previous aggregator never sent
|
1080
|
+
# when it hits max size, the aggregator queue starts backing up
|
1081
|
+
wait do
|
1082
|
+
expect(agent.instance_variable_get(:@aggregator_queue).size).to eq(1)
|
1083
|
+
expect(agent.instance_variable_get(:@event_aggregator).size).to eq(5)
|
1084
|
+
expect(agent.instance_variable_get(:@sender_queue).size).to eq(1)
|
1085
|
+
end
|
1086
|
+
|
1087
|
+
# send 3 more items, to overflow the aggregator queue
|
1088
|
+
allow(agent.logger).to receive(:debug)
|
1089
|
+
expect(agent.logger).to receive(:debug).with("Dropping command, queue full(3): increment overflow_testc 4 300 1")
|
1090
|
+
agent.increment('overflow_testa')
|
1091
|
+
agent.increment('overflow_testb')
|
1092
|
+
agent.increment('overflow_testc', 4, 300, 1) # will get dropped
|
1093
|
+
|
1094
|
+
wait do
|
1095
|
+
expect(agent.instance_variable_get(:@aggregator_queue).size).to eq(3)
|
1096
|
+
expect(agent.instance_variable_get(:@event_aggregator).size).to eq(5)
|
1097
|
+
expect(agent.instance_variable_get(:@sender_queue).size).to eq(1)
|
1098
|
+
end
|
1099
|
+
end
|
1100
|
+
end
|
1101
|
+
end
|
1102
|
+
|
1103
|
+
if FORK_SUPPORTED
|
1104
|
+
it "should automatically reconnect when forked when aggregation is enabled" do
|
1105
|
+
Timecop.travel start_of_minute
|
1106
|
+
agent.frequency = 10
|
1107
|
+
|
1108
|
+
agent.increment('fork_reconnect_test1', 1, 0, 1)
|
1109
|
+
fork do
|
1110
|
+
agent.increment('fork_reconnect_test2', 1, 0, 1) # triggers reconnect
|
1111
|
+
exit
|
1112
|
+
end
|
1113
|
+
|
1114
|
+
|
1115
|
+
sleep 1
|
1116
|
+
agent.increment('fork_reconnect_test3', 1, 0, 1) # triggers reconnect
|
1117
|
+
|
1118
|
+
agent.flush
|
1119
|
+
expect(server.connect_count).to eq(2)
|
1120
|
+
|
1121
|
+
wait do
|
1122
|
+
expect(server.commands).to include("increment fork_reconnect_test1 1 0 1")
|
1123
|
+
expect(server.commands).to include("increment fork_reconnect_test2 1 0 1")
|
1124
|
+
expect(server.commands).to include("increment fork_reconnect_test3 1 0 1")
|
1125
|
+
expect(server.commands.grep(/fork_reconnect/).size).to eq(3)
|
1126
|
+
end
|
1127
|
+
end
|
1128
|
+
end
|
1129
|
+
end
|
1130
|
+
end
|
738
1131
|
end
|
739
1132
|
end
|
740
1133
|
|