instrumental_agent 1.0.1 → 3.0.0.beta
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/.ruby-version +1 -0
- data/.travis.yml +4 -6
- data/CHANGELOG.md +21 -0
- data/Gemfile +3 -5
- data/README.md +40 -9
- data/instrumental_agent.gemspec +6 -2
- data/lib/instrumental/agent.rb +262 -155
- data/lib/instrumental/capistrano.rb +4 -46
- data/lib/instrumental/capistrano/capistrano2.rb +47 -0
- data/lib/instrumental/capistrano/capistrano3.rake +56 -0
- data/lib/instrumental/command_structs.rb +32 -0
- data/lib/instrumental/event_aggregator.rb +26 -0
- data/lib/instrumental/version.rb +1 -1
- data/script/setup +22 -4
- data/script/test +34 -2
- data/spec/agent_spec.rb +579 -109
- data/spec/command_struct_specs.rb +20 -0
- data/spec/event_aggregator_spec.rb +53 -0
- data/spec/spec_helper.rb +8 -1
- data/spec/test_server.rb +13 -6
- metadata +47 -17
- data/certs/equifax.ca.pem +0 -69
- data/certs/geotrust.ca.pem +0 -80
- data/certs/rapidssl.ca.pem +0 -94
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: a1a781a0abec98e8c737a57fac5134fe2ee847879949fc5134ced2c0b0136076
|
4
|
+
data.tar.gz: 89950c6e0b59713dc6034be99acfc52473ef049b63824685694626d049a85b8b
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: ca82be6040189400c0e01513d517202e8d27d900ad2f10e5558025cad6a499e49985ee58c95efdf90d24e2fd0174b9349c7cd6ac8f906325f2de766c0f14cba3
|
7
|
+
data.tar.gz: 742200da53f676a9341a795f5f8145416650c033466782348fc4852dc62e656e9fed4e2be02c0829c2ec946092e7dfcd7d836cc50e731a6ceb8cb2493b76a1b6
|
data/.ruby-version
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
2.6.3
|
data/.travis.yml
CHANGED
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,24 @@
|
|
1
|
+
### 3.0.0.beta [October 8, 2020]
|
2
|
+
* Drop support for outdated versions of Ruby
|
3
|
+
* Explicitly add support for new versions of Ruby
|
4
|
+
* Add support for client-side aggregation
|
5
|
+
* Note: the agent API has NOT changed. This is a major release because of the significant changes in Ruby versions officially supported.
|
6
|
+
|
7
|
+
### 3.0.0.alpha [August 22, 2019]
|
8
|
+
* Drop support for outdated versions of Ruby
|
9
|
+
* Explicitly add support for new versions of Ruby
|
10
|
+
* Better handling of SSL errors when connecting to Instrumental
|
11
|
+
* Note: the agent API has NOT changed. This is a major release because of the significant changes in Ruby versions officially supported.
|
12
|
+
|
13
|
+
### 2.1.0 [January 19, 2018]
|
14
|
+
* Add support for capistrano 3
|
15
|
+
|
16
|
+
### 2.0.0 [August 21, 2017]
|
17
|
+
* Add automatic tracking of common application metrics, official release
|
18
|
+
|
19
|
+
### 2.0.0.alpha [August 18, 2017]
|
20
|
+
* Add automatic tracking of common application metrics
|
21
|
+
|
1
22
|
### 1.0.1 [July 12, 2016]
|
2
23
|
* Make agent initialization threadsafe
|
3
24
|
|
data/Gemfile
CHANGED
@@ -1,8 +1,6 @@
|
|
1
1
|
source "https://rubygems.org"
|
2
2
|
|
3
3
|
gemspec
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
gem 'system_timer', '~> 1.2'
|
8
|
-
end
|
4
|
+
|
5
|
+
# fixes 2.3.0 ffi bundle error
|
6
|
+
gem 'ffi', '~> 1.0.11'
|
data/README.md
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
|
3
3
|
Instrumental is an [application monitoring platform](https://instrumentalapp.com) built for developers who want a better understanding of their production software. Powerful tools, like the [Instrumental Query Language](https://instrumentalapp.com/docs/query-language), combined with an exploration-focused interface allow you to get real answers to complex questions, in real time.
|
4
4
|
|
5
|
-
This agent supports custom metric monitoring for Ruby applications. It provides high-data reliability at high scale, without ever blocking your process or causing an exception.
|
5
|
+
This agent supports custom metric monitoring for Ruby applications. It provides high-data reliability at high scale, without ever blocking your process or causing an exception.
|
6
6
|
|
7
7
|
## Setup & Usage
|
8
8
|
|
@@ -12,10 +12,10 @@ Add the gem to your Gemfile.
|
|
12
12
|
gem 'instrumental_agent'
|
13
13
|
```
|
14
14
|
|
15
|
-
Visit [instrumentalapp.com](https://instrumentalapp.com) and create an account, then initialize the agent with your API
|
15
|
+
Visit [instrumentalapp.com](https://instrumentalapp.com) and create an account, then initialize the agent with your [project API token](https://instrumentalapp.com/docs/tokens).
|
16
16
|
|
17
17
|
```ruby
|
18
|
-
I = Instrumental::Agent.new('
|
18
|
+
I = Instrumental::Agent.new('PROJECT_API_TOKEN', :enabled => Rails.env.production?)
|
19
19
|
```
|
20
20
|
|
21
21
|
You'll probably want something like the above, only enabling the agent in production mode so you don't have development and production data writing to the same value. Or you can set up two projects, so that you can verify stats in one, and release them to production in another.
|
@@ -59,15 +59,21 @@ User.find_each do |user|
|
|
59
59
|
end
|
60
60
|
```
|
61
61
|
|
62
|
-
##
|
62
|
+
## Aggregation
|
63
|
+
Aggregation collects more data on your system before sending it to Instrumental. This reduces the total amount of data being sent, at the cost of a small amount of additional latency. You can control this feature with the frequency parameter:
|
63
64
|
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
gem install instrumental_tools
|
68
|
-
instrument_server
|
65
|
+
```ruby
|
66
|
+
I = Instrumental::Agent.new('PROJECT_API_TOKEN', :frequency => 15) # send data every 15 seconds
|
67
|
+
I.frequency = 6 # send batches of data every 6 seconds
|
69
68
|
```
|
70
69
|
|
70
|
+
The agent may send data more frequently if you are sending a large number of different metrics. Values between 3 and 15 are generally reasonable. If you want to disable this behavior and send every metric as fast as possible, set frequency to zero or nil. Note that a frequency of zero will still use a separate thread for performance - it is NOT the same as synchronous mode.
|
71
|
+
|
72
|
+
|
73
|
+
## Server Metrics
|
74
|
+
|
75
|
+
Want server stats like load, memory, etc.? Check out [InstrumentalD](https://github.com/instrumental/instrumentald).
|
76
|
+
|
71
77
|
## Agent Control
|
72
78
|
|
73
79
|
Need to quickly disable the agent? Set :enabled to false on initialization and you don't need to change any application code.
|
@@ -99,6 +105,31 @@ If you plan on tracking metrics in Resque jobs, you will need to explicitly clea
|
|
99
105
|
|
100
106
|
You're required to do this because Resque calls `exit!` when a worker has finished processing, which bypasses Ruby's `at_exit` hooks. The Instrumental Agent installs an `at_exit` hook to flush any pending metrics to the servers, but this hook is bypassed by the `exit!` call; any other code you rely on that uses `exit!` should call `I.cleanup` to ensure any pending metrics are correctly sent to the server before exiting the process.
|
101
107
|
|
108
|
+
## Automated Metric Collection
|
109
|
+
|
110
|
+
v2.x+ of the Instrumental Agent introduced automated metric collection for your application by way of the [Metrician gem](https://github.com/Instrumental/metrician-ruby). You can read more about the metrics it collects in the [Instrumental documentation](https://instrumentalapp.com/docs/metrician/installation).
|
111
|
+
|
112
|
+
### Upgrading from 1.x
|
113
|
+
|
114
|
+
If you are upgrading from the pre-2.x version of instrumental and **do not** want automated metric collection, you can disable it by setting the following in your agent setup:
|
115
|
+
|
116
|
+
```
|
117
|
+
I = Instrumental::Agent.new('PROJECT_API_TOKEN',
|
118
|
+
:enabled => Rails.env.production?,
|
119
|
+
:metrician => false
|
120
|
+
)
|
121
|
+
```
|
122
|
+
|
123
|
+
### Upgrading from 2.x
|
124
|
+
|
125
|
+
Agent version 3.x drops support for some older rubies, but should otherwise be a drop-in replacement. If you wish to enable Aggregation, enable the agent with the frequency option set to the number of seconds you would like to wait between flushes. For example:
|
126
|
+
|
127
|
+
```
|
128
|
+
I = Instrumental::Agent.new('PROJECT_API_TOKEN',
|
129
|
+
:enabled => Rails.env.production?,
|
130
|
+
:frequency => 15
|
131
|
+
)
|
132
|
+
```
|
102
133
|
|
103
134
|
## Troubleshooting & Help
|
104
135
|
|
data/instrumental_agent.gemspec
CHANGED
@@ -4,20 +4,24 @@ require "instrumental/version"
|
|
4
4
|
Gem::Specification.new do |s|
|
5
5
|
s.name = "instrumental_agent"
|
6
6
|
s.version = Instrumental::VERSION
|
7
|
-
s.authors = ["
|
7
|
+
s.authors = ["Expected Behavior"]
|
8
8
|
s.email = ["support@instrumentalapp.com"]
|
9
9
|
s.homepage = "http://github.com/instrumental/instrumental_agent-ruby"
|
10
10
|
s.summary = %q{Custom metric monitoring for Ruby applications via Instrumental}
|
11
11
|
s.description = %q{This agent supports Instrumental custom metric monitoring for Ruby applications. It provides high-data reliability at high scale, without ever blocking your process or causing an exception.}
|
12
12
|
s.license = "MIT"
|
13
|
-
|
13
|
+
s.required_ruby_version = '>= 2.5.7'
|
14
14
|
|
15
15
|
s.files = `git ls-files`.split("\n")
|
16
16
|
s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
|
17
17
|
s.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
|
18
18
|
s.require_paths = ["lib"]
|
19
|
+
|
20
|
+
s.add_runtime_dependency("metrician", [">= 0"])
|
21
|
+
|
19
22
|
s.add_development_dependency("pry", [">= 0"])
|
20
23
|
s.add_development_dependency("rake", [">= 0"])
|
21
24
|
s.add_development_dependency("rspec", ["~> 3.0"])
|
22
25
|
s.add_development_dependency("fuubar", [">= 0"])
|
26
|
+
s.add_development_dependency("timecop", [">= 0"])
|
23
27
|
end
|
data/lib/instrumental/agent.rb
CHANGED
@@ -1,10 +1,13 @@
|
|
1
1
|
require 'instrumental/version'
|
2
2
|
require 'instrumental/system_timer'
|
3
|
+
require 'instrumental/command_structs'
|
4
|
+
require 'instrumental/event_aggregator'
|
3
5
|
require 'logger'
|
4
6
|
require 'openssl' rescue nil
|
5
7
|
require 'resolv'
|
6
8
|
require 'thread'
|
7
9
|
require 'socket'
|
10
|
+
require 'metrician'
|
8
11
|
|
9
12
|
|
10
13
|
module Instrumental
|
@@ -14,14 +17,17 @@ module Instrumental
|
|
14
17
|
EXIT_FLUSH_TIMEOUT = 5
|
15
18
|
HOSTNAME = Socket.gethostbyname(Socket.gethostname).first rescue Socket.gethostname
|
16
19
|
MAX_BUFFER = 5000
|
20
|
+
MAX_AGGREGATOR_SIZE = 5000
|
17
21
|
MAX_RECONNECT_DELAY = 15
|
18
22
|
REPLY_TIMEOUT = 10
|
19
23
|
RESOLUTION_FAILURES_BEFORE_WAITING = 3
|
20
24
|
RESOLUTION_WAIT = 30
|
21
25
|
RESOLVE_TIMEOUT = 1
|
26
|
+
DEFAULT_FREQUENCY = 0
|
27
|
+
VALID_FREQUENCIES = [0, 1, 2, 3, 4, 5, 6, 10, 12, 15, 20, 30, 60]
|
22
28
|
|
23
29
|
|
24
|
-
attr_accessor :host, :port, :synchronous, :
|
30
|
+
attr_accessor :host, :port, :synchronous, :frequency, :sender_queue, :aggregator_queue, :dns_resolutions, :last_connect_at
|
25
31
|
attr_reader :connection, :enabled, :secure
|
26
32
|
|
27
33
|
def self.logger=(l)
|
@@ -51,6 +57,7 @@ module Instrumental
|
|
51
57
|
# port: 8001
|
52
58
|
# enabled: true
|
53
59
|
# synchronous: false
|
60
|
+
# frequency: 10
|
54
61
|
# secure: true
|
55
62
|
# verify: true
|
56
63
|
@api_key = api_key
|
@@ -72,15 +79,29 @@ module Instrumental
|
|
72
79
|
@port = (@port || default_port).to_i
|
73
80
|
@enabled = options.has_key?(:enabled) ? !!options[:enabled] : true
|
74
81
|
@synchronous = !!options[:synchronous]
|
82
|
+
|
83
|
+
if options.has_key?(:frequency)
|
84
|
+
self.frequency = options[:frequency]
|
85
|
+
else
|
86
|
+
self.frequency = DEFAULT_FREQUENCY
|
87
|
+
end
|
88
|
+
|
89
|
+
@metrician = options[:metrician].nil? ? true : !!options[:metrician]
|
75
90
|
@pid = Process.pid
|
76
91
|
@allow_reconnect = true
|
77
|
-
@certs = certificates
|
78
92
|
@dns_resolutions = 0
|
79
93
|
@last_connect_at = 0
|
94
|
+
|
80
95
|
@start_worker_mutex = Mutex.new
|
81
|
-
@
|
96
|
+
@aggregator_queue = Queue.new
|
97
|
+
@sender_queue = Queue.new
|
98
|
+
|
82
99
|
|
83
100
|
setup_cleanup_at_exit if @enabled
|
101
|
+
|
102
|
+
if @metrician
|
103
|
+
Metrician.activate(self)
|
104
|
+
end
|
84
105
|
end
|
85
106
|
|
86
107
|
# Store a gauge for a metric, optionally at a specific time.
|
@@ -88,7 +109,9 @@ module Instrumental
|
|
88
109
|
# agent.gauge('load', 1.23)
|
89
110
|
def gauge(metric, value, time = Time.now, count = 1)
|
90
111
|
if valid?(metric, value, time, count) &&
|
91
|
-
|
112
|
+
send_command(Instrumental::Command.new("gauge".freeze, metric, value, time, count))
|
113
|
+
# tempted to "gauge" this to a symbol? Don't. Frozen strings are very fast,
|
114
|
+
# and later we're going to to_s every one of these anyway.
|
92
115
|
value
|
93
116
|
else
|
94
117
|
nil
|
@@ -136,7 +159,7 @@ module Instrumental
|
|
136
159
|
# agent.increment('users')
|
137
160
|
def increment(metric, value = 1, time = Time.now, count = 1)
|
138
161
|
if valid?(metric, value, time, count) &&
|
139
|
-
|
162
|
+
send_command(Instrumental::Command.new("increment".freeze, metric, value, time, count))
|
140
163
|
value
|
141
164
|
else
|
142
165
|
nil
|
@@ -151,7 +174,7 @@ module Instrumental
|
|
151
174
|
# agent.notice('A notice')
|
152
175
|
def notice(note, time = Time.now, duration = 0)
|
153
176
|
if valid_note?(note)
|
154
|
-
send_command(
|
177
|
+
send_command(Instrumental::Notice.new(note, time, duration))
|
155
178
|
note
|
156
179
|
else
|
157
180
|
nil
|
@@ -190,6 +213,22 @@ module Instrumental
|
|
190
213
|
@logger || self.class.logger
|
191
214
|
end
|
192
215
|
|
216
|
+
def frequency=(frequency)
|
217
|
+
freq = frequency.to_i
|
218
|
+
if !VALID_FREQUENCIES.include?(freq)
|
219
|
+
logger.warn "Frequency must be a value that divides evenly into 60: 1, 2, 3, 4, 5, 6, 10, 12, 15, 20, 30, or 60."
|
220
|
+
# this will make all negative numbers and nils into 0s
|
221
|
+
freq = VALID_FREQUENCIES.select{ |f| f < freq }.max.to_i
|
222
|
+
end
|
223
|
+
|
224
|
+
@frequency = if(@synchronous)
|
225
|
+
logger.warn "Synchronous and Frequency should not be enabled at the same time! Defaulting to synchronous mode."
|
226
|
+
0
|
227
|
+
else
|
228
|
+
freq
|
229
|
+
end
|
230
|
+
end
|
231
|
+
|
193
232
|
# Stopping the agent will immediately stop all communication
|
194
233
|
# to Instrumental. If you call this and submit another metric,
|
195
234
|
# the agent will start again.
|
@@ -201,12 +240,19 @@ module Instrumental
|
|
201
240
|
#
|
202
241
|
def stop
|
203
242
|
disconnect
|
204
|
-
if @
|
205
|
-
@
|
206
|
-
@
|
243
|
+
if @sender_thread
|
244
|
+
@sender_thread.kill
|
245
|
+
@sender_thread = nil
|
246
|
+
end
|
247
|
+
if @aggregator_thread
|
248
|
+
@aggregator_thread.kill
|
249
|
+
@aggregator_thread = nil
|
250
|
+
end
|
251
|
+
if @sender_queue
|
252
|
+
@sender_queue.clear
|
207
253
|
end
|
208
|
-
if @
|
209
|
-
@
|
254
|
+
if @aggregator_queue
|
255
|
+
@aggregator_queue.clear
|
210
256
|
end
|
211
257
|
end
|
212
258
|
|
@@ -216,15 +262,22 @@ module Instrumental
|
|
216
262
|
# where at_exit is bypassed like Resque workers.
|
217
263
|
def cleanup
|
218
264
|
if running?
|
219
|
-
logger.info "Cleaning up agent,
|
265
|
+
logger.info "Cleaning up agent, aggregator_size: #{@aggregator_queue.size}, thread_running: #{@aggregator_thread.alive?}"
|
266
|
+
logger.info "Cleaning up agent, queue size: #{@sender_queue.size}, thread running: #{@sender_thread.alive?}"
|
220
267
|
@allow_reconnect = false
|
221
|
-
if @
|
222
|
-
|
268
|
+
if @sender_queue.size > 0 || @aggregator_queue.size > 0
|
269
|
+
@sender_queue << ['exit']
|
270
|
+
@aggregator_queue << ['exit']
|
223
271
|
begin
|
224
|
-
with_timeout(EXIT_FLUSH_TIMEOUT) { @
|
272
|
+
with_timeout(EXIT_FLUSH_TIMEOUT) { @aggregator_thread.join }
|
273
|
+
with_timeout(EXIT_FLUSH_TIMEOUT) { @sender_thread.join }
|
225
274
|
rescue Timeout::Error
|
226
|
-
|
227
|
-
|
275
|
+
total_size = @sender_queue&.size.to_i +
|
276
|
+
@aggregator_queue&.size.to_i +
|
277
|
+
@event_aggregator&.size.to_i
|
278
|
+
|
279
|
+
if total_size > 0
|
280
|
+
logger.error "Timed out working agent thread on exit, dropping #{total_size} metrics"
|
228
281
|
else
|
229
282
|
logger.error "Timed out Instrumental Agent, exiting"
|
230
283
|
end
|
@@ -265,7 +318,8 @@ module Instrumental
|
|
265
318
|
end
|
266
319
|
|
267
320
|
def report_exception(e)
|
268
|
-
|
321
|
+
# puts "--- Exception of type #{e.class} occurred:\n#{e.message}\n#{e.backtrace.join("\n")}"
|
322
|
+
logger.error "Exception of type #{e.class} occurred:\n#{e.message}\n#{e.backtrace.join("\n")}"
|
269
323
|
end
|
270
324
|
|
271
325
|
def ipv4_address_for_host(host, port, moment_to_connect = Time.now.to_i)
|
@@ -285,44 +339,41 @@ module Instrumental
|
|
285
339
|
nil
|
286
340
|
end
|
287
341
|
|
288
|
-
def send_command(
|
289
|
-
|
290
|
-
|
291
|
-
|
292
|
-
|
293
|
-
|
294
|
-
|
295
|
-
|
296
|
-
queue_message(cmd, { :synchronous => @synchronous })
|
297
|
-
else
|
298
|
-
if !@queue_full_warning
|
299
|
-
@queue_full_warning = true
|
300
|
-
logger.warn "Queue full(#{@queue.size}), dropping commands..."
|
301
|
-
end
|
302
|
-
logger.debug "Dropping command, queue full(#{@queue.size}): #{cmd.chomp}"
|
303
|
-
nil
|
304
|
-
end
|
342
|
+
def send_command(command)
|
343
|
+
return logger.debug(command.to_s) unless enabled?
|
344
|
+
start_workers
|
345
|
+
critical_queue = frequency.to_i == 0 ? @sender_queue : @aggregator_queue
|
346
|
+
if critical_queue && critical_queue.size < MAX_BUFFER
|
347
|
+
@queue_full_warning = false
|
348
|
+
logger.debug "Queueing: #{command.to_s}"
|
349
|
+
queue_message(command, { :synchronous => @synchronous })
|
305
350
|
else
|
306
|
-
|
351
|
+
if !@queue_full_warning
|
352
|
+
@queue_full_warning = true
|
353
|
+
logger.warn "Queue full(#{critical_queue.size}), dropping commands..."
|
354
|
+
end
|
355
|
+
logger.debug "Dropping command, queue full(#{critical_queue.size}): #{command.to_s}"
|
356
|
+
nil
|
307
357
|
end
|
308
358
|
end
|
309
359
|
|
310
360
|
def queue_message(message, options = {})
|
311
|
-
|
312
|
-
|
313
|
-
|
314
|
-
|
315
|
-
|
316
|
-
|
317
|
-
|
318
|
-
|
319
|
-
@
|
320
|
-
|
321
|
-
|
322
|
-
|
323
|
-
|
324
|
-
|
325
|
-
|
361
|
+
return message unless enabled?
|
362
|
+
|
363
|
+
# imagine it's a reverse merge, but with fewer allocations
|
364
|
+
options[:allow_reconnect] = @allow_reconnect unless options.has_key?(:allow_reconnect)
|
365
|
+
|
366
|
+
if options.delete(:synchronous)
|
367
|
+
options[:sync_resource] ||= ConditionVariable.new
|
368
|
+
@sync_mutex.synchronize {
|
369
|
+
queue = message == "flush" ? @aggregator_queue : @sender_queue
|
370
|
+
queue << [message, options]
|
371
|
+
options[:sync_resource].wait(@sync_mutex)
|
372
|
+
}
|
373
|
+
elsif frequency.to_i == 0
|
374
|
+
@sender_queue << [message, options]
|
375
|
+
else
|
376
|
+
@aggregator_queue << [message, options]
|
326
377
|
end
|
327
378
|
message
|
328
379
|
end
|
@@ -344,31 +395,15 @@ module Instrumental
|
|
344
395
|
|
345
396
|
def test_connection
|
346
397
|
begin
|
347
|
-
|
348
|
-
# on Ruby 1.8.6, 1.8.7 or 1.9.1, read_nonblock does not exist,
|
349
|
-
# and so the case of testing socket liveliness via a nonblocking
|
350
|
-
# read that catches a wait condition won't work.
|
351
|
-
#
|
352
|
-
# We grab the SSL socket's underlying IO object and perform the
|
353
|
-
# non blocking read there in order to ensure the socket is still
|
354
|
-
# valid
|
355
|
-
if @socket.respond_to?(:read_nonblock)
|
356
|
-
@socket.read_nonblock(1)
|
357
|
-
elsif @socket.respond_to?(:io)
|
358
|
-
# The SSL Socket may send down additional data at close time,
|
359
|
-
# so we perform two nonblocking reads, one to pull any pending
|
360
|
-
# data on the socket, and the second to actually perform the connection
|
361
|
-
# liveliness test
|
362
|
-
@socket.io.read_nonblock(1024) && @socket.io.read_nonblock(1024)
|
363
|
-
end
|
398
|
+
@socket.read_nonblock(1)
|
364
399
|
rescue *wait_exceptions
|
365
400
|
# noop
|
366
401
|
end
|
367
402
|
end
|
368
403
|
|
369
|
-
def
|
404
|
+
def start_workers
|
370
405
|
# NOTE: We need a mutex around both `running?` and thread creation,
|
371
|
-
# otherwise we could create
|
406
|
+
# otherwise we could create too many threads.
|
372
407
|
# Return early and queue the message if another thread is
|
373
408
|
# starting the worker.
|
374
409
|
return if !@start_worker_mutex.try_lock
|
@@ -382,9 +417,19 @@ module Instrumental
|
|
382
417
|
@sync_mutex = Mutex.new
|
383
418
|
@failures = 0
|
384
419
|
@sockaddr_in = Socket.pack_sockaddr_in(@port, address)
|
385
|
-
|
386
|
-
|
387
|
-
|
420
|
+
|
421
|
+
logger.info "Starting aggregator thread"
|
422
|
+
if !@aggregator_thread&.alive?
|
423
|
+
@aggregator_thread = Thread.new do
|
424
|
+
run_aggregator_loop
|
425
|
+
end
|
426
|
+
end
|
427
|
+
|
428
|
+
if !@sender_thread&.alive?
|
429
|
+
logger.info "Starting sender thread"
|
430
|
+
@sender_thread = Thread.new do
|
431
|
+
run_sender_loop
|
432
|
+
end
|
388
433
|
end
|
389
434
|
end
|
390
435
|
ensure
|
@@ -420,81 +465,152 @@ module Instrumental
|
|
420
465
|
sock
|
421
466
|
end
|
422
467
|
|
423
|
-
def
|
424
|
-
|
425
|
-
|
426
|
-
|
427
|
-
|
428
|
-
|
468
|
+
def run_aggregator_loop
|
469
|
+
# if the sender queue is some level of full, should we keep aggregating until it empties out?
|
470
|
+
# what does this mean for aggregation slices - aggregating to nearest frequency will
|
471
|
+
# make the object needlessly larger, when minute resolution is what we have on the server
|
472
|
+
begin
|
473
|
+
loop do
|
474
|
+
now = Time.now.to_i
|
475
|
+
time_to_wait = if frequency == 0
|
476
|
+
0
|
477
|
+
else
|
478
|
+
next_frequency = (now - (now % frequency)) + frequency
|
479
|
+
time_to_wait = [(next_frequency - Time.now.to_f), 0].max
|
480
|
+
end
|
481
|
+
|
482
|
+
command_and_args, command_options = if @event_aggregator&.size.to_i > MAX_AGGREGATOR_SIZE
|
483
|
+
logger.info "Aggregator full, flushing early with #{MAX_AGGREGATOR_SIZE} metrics."
|
484
|
+
command_and_args, command_options = ['forward', {}]
|
485
|
+
else
|
486
|
+
begin
|
487
|
+
with_timeout(time_to_wait) do
|
488
|
+
@aggregator_queue.pop
|
489
|
+
end
|
490
|
+
rescue Timeout::Error
|
491
|
+
['forward', {}]
|
492
|
+
end
|
493
|
+
end
|
494
|
+
if command_and_args
|
495
|
+
sync_resource = command_options && command_options[:sync_resource]
|
496
|
+
case command_and_args
|
497
|
+
when 'exit'
|
498
|
+
logger.info "Exiting, #{@aggregator_queue.size} commands remain"
|
499
|
+
return true
|
500
|
+
when 'flush'
|
501
|
+
if !@event_aggregator.nil?
|
502
|
+
@sender_queue << @event_aggregator
|
503
|
+
@event_aggregator = nil
|
504
|
+
end
|
505
|
+
@sender_queue << ['flush', command_options]
|
506
|
+
when 'forward'
|
507
|
+
if !@event_aggregator.nil?
|
508
|
+
next if @sender_queue.size > 0 && @sender_queue.num_waiting < 1
|
509
|
+
@sender_queue << @event_aggregator
|
510
|
+
@event_aggregator = nil
|
511
|
+
end
|
512
|
+
when Notice
|
513
|
+
@sender_queue << [command_and_args, command_options]
|
514
|
+
else
|
515
|
+
@event_aggregator = EventAggregator.new(frequency: @frequency) if @event_aggregator.nil?
|
516
|
+
|
517
|
+
logger.debug "Sending: #{command_and_args} to aggregator"
|
518
|
+
@event_aggregator.put(command_and_args)
|
519
|
+
end
|
520
|
+
command_and_args = nil
|
521
|
+
command_options = nil
|
522
|
+
end
|
523
|
+
end
|
524
|
+
rescue Exception => err
|
525
|
+
report_exception(err)
|
429
526
|
end
|
430
|
-
|
431
|
-
|
432
|
-
|
433
|
-
"hostname" => HOSTNAME,
|
434
|
-
"pid" => Process.pid,
|
435
|
-
"runtime" => "#{defined?(RUBY_ENGINE) ? RUBY_ENGINE : "ruby"}/#{RUBY_VERSION}p#{RUBY_PATCHLEVEL}",
|
436
|
-
"platform" => RUBY_PLATFORM
|
437
|
-
}.to_a.flatten.map { |v| v.to_s.gsub(/\s+/, "_") }.join(" ")
|
438
|
-
|
439
|
-
send_with_reply_timeout "hello #{hello_options}"
|
440
|
-
send_with_reply_timeout "authenticate #{@api_key}"
|
527
|
+
end
|
528
|
+
|
529
|
+
def run_sender_loop
|
441
530
|
@failures = 0
|
442
|
-
|
443
|
-
|
444
|
-
|
445
|
-
|
446
|
-
|
447
|
-
|
448
|
-
|
449
|
-
|
450
|
-
|
451
|
-
|
452
|
-
|
453
|
-
|
454
|
-
|
455
|
-
|
456
|
-
|
457
|
-
|
458
|
-
|
459
|
-
|
460
|
-
|
461
|
-
|
531
|
+
begin
|
532
|
+
logger.info "connecting to collector"
|
533
|
+
command_and_args = nil
|
534
|
+
command_options = nil
|
535
|
+
with_timeout(CONNECT_TIMEOUT) do
|
536
|
+
@socket = open_socket(@sockaddr_in, @secure, @verify_cert)
|
537
|
+
end
|
538
|
+
logger.info "connected to collector at #{host}:#{port}"
|
539
|
+
hello_options = {
|
540
|
+
"version" => "ruby/instrumental_agent/#{VERSION}",
|
541
|
+
"hostname" => HOSTNAME,
|
542
|
+
"pid" => Process.pid,
|
543
|
+
"runtime" => "#{defined?(RUBY_ENGINE) ? RUBY_ENGINE : "ruby"}/#{RUBY_VERSION}p#{RUBY_PATCHLEVEL}",
|
544
|
+
"platform" => RUBY_PLATFORM
|
545
|
+
}.to_a.flatten.map { |v| v.to_s.gsub(/\s+/, "_") }.join(" ")
|
546
|
+
|
547
|
+
send_with_reply_timeout "hello #{hello_options}"
|
548
|
+
send_with_reply_timeout "authenticate #{@api_key}"
|
549
|
+
|
550
|
+
loop do
|
551
|
+
command_and_args, command_options = @sender_queue.pop
|
552
|
+
if command_and_args
|
553
|
+
sync_resource = command_options && command_options[:sync_resource]
|
554
|
+
test_connection
|
555
|
+
case command_and_args
|
556
|
+
when 'exit'
|
557
|
+
logger.info "Exiting, #{@sender_queue.size} commands remain"
|
558
|
+
return true
|
559
|
+
when 'flush'
|
560
|
+
release_resource = true
|
561
|
+
when EventAggregator
|
562
|
+
command_and_args.values.values.each do |command|
|
563
|
+
logger.debug "Sending: #{command}"
|
564
|
+
@socket.puts command
|
565
|
+
end
|
566
|
+
else
|
567
|
+
logger.debug "Sending: #{command_and_args}"
|
568
|
+
@socket.puts command_and_args
|
569
|
+
end
|
570
|
+
command_and_args = nil
|
571
|
+
command_options = nil
|
572
|
+
if sync_resource
|
573
|
+
@sync_mutex.synchronize do
|
574
|
+
sync_resource.signal
|
575
|
+
end
|
462
576
|
end
|
463
577
|
end
|
464
578
|
end
|
465
|
-
|
466
|
-
|
467
|
-
|
468
|
-
|
469
|
-
when EOFError
|
579
|
+
rescue Exception => err
|
580
|
+
allow_reconnect = @allow_reconnect
|
581
|
+
case err
|
582
|
+
when EOFError
|
470
583
|
# nop
|
471
|
-
|
472
|
-
|
473
|
-
|
474
|
-
|
475
|
-
|
476
|
-
|
477
|
-
|
478
|
-
|
479
|
-
|
480
|
-
|
481
|
-
|
482
|
-
|
483
|
-
|
484
|
-
|
485
|
-
|
486
|
-
|
487
|
-
|
488
|
-
|
584
|
+
when Errno::ECONNREFUSED, Errno::EHOSTUNREACH, Errno::EADDRINUSE, Timeout::Error, OpenSSL::SSL::SSLError
|
585
|
+
# If the connection has been refused by Instrumental
|
586
|
+
# or we cannot reach the server
|
587
|
+
# or the connection state of this socket is in a race
|
588
|
+
# or SSL is not functioning properly for some reason
|
589
|
+
logger.error "unable to connect to Instrumental, hanging up with #{@sender_queue.size} messages remaining"
|
590
|
+
logger.debug "Exception: #{err.inspect}\n#{err.backtrace.join("\n")}"
|
591
|
+
allow_reconnect = false
|
592
|
+
else
|
593
|
+
report_exception(err)
|
594
|
+
end
|
595
|
+
if allow_reconnect == false ||
|
596
|
+
(command_options && command_options[:allow_reconnect] == false)
|
597
|
+
logger.info "Not trying to reconnect"
|
598
|
+
@failures = 0
|
599
|
+
return
|
600
|
+
end
|
601
|
+
if command_and_args
|
602
|
+
logger.debug "requeueing: #{command_and_args}"
|
603
|
+
@sender_queue << command_and_args
|
604
|
+
end
|
605
|
+
disconnect
|
606
|
+
@failures += 1
|
607
|
+
delay = [(@failures - 1) ** BACKOFF, MAX_RECONNECT_DELAY].min
|
608
|
+
logger.error "disconnected, #{@failures} failures in a row, reconnect in #{delay}..."
|
609
|
+
sleep delay
|
610
|
+
retry
|
611
|
+
ensure
|
612
|
+
disconnect
|
489
613
|
end
|
490
|
-
disconnect
|
491
|
-
@failures += 1
|
492
|
-
delay = [(@failures - 1) ** BACKOFF, MAX_RECONNECT_DELAY].min
|
493
|
-
logger.error "disconnected, #{@failures} failures in a row, reconnect in #{delay}..."
|
494
|
-
sleep delay
|
495
|
-
retry
|
496
|
-
ensure
|
497
|
-
disconnect
|
498
614
|
end
|
499
615
|
|
500
616
|
def setup_cleanup_at_exit
|
@@ -504,7 +620,11 @@ module Instrumental
|
|
504
620
|
end
|
505
621
|
|
506
622
|
def running?
|
507
|
-
!@
|
623
|
+
!@sender_thread.nil? &&
|
624
|
+
!@aggregator_thread.nil? &&
|
625
|
+
@pid == Process.pid &&
|
626
|
+
@sender_thread.alive? &&
|
627
|
+
@aggregator_thread.alive?
|
508
628
|
end
|
509
629
|
|
510
630
|
def flush_socket(socket)
|
@@ -534,18 +654,5 @@ module Instrumental
|
|
534
654
|
def allows_secure?
|
535
655
|
defined?(OpenSSL)
|
536
656
|
end
|
537
|
-
|
538
|
-
def certificates
|
539
|
-
if allows_secure?
|
540
|
-
base_dir = File.expand_path(File.join(File.dirname(__FILE__), "..", ".."))
|
541
|
-
%w{equifax geotrust rapidssl}.map do |name|
|
542
|
-
OpenSSL::X509::Certificate.new(File.open(File.join(base_dir, "certs", "#{name}.ca.pem")))
|
543
|
-
end
|
544
|
-
else
|
545
|
-
[]
|
546
|
-
end
|
547
|
-
end
|
548
|
-
|
549
657
|
end
|
550
|
-
|
551
658
|
end
|