datadog 2.6.0 → 2.7.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +20 -1
- data/ext/libdatadog_api/crashtracker.c +6 -4
- data/ext/libdatadog_extconf_helpers.rb +1 -1
- data/lib/datadog/core/configuration/settings.rb +4 -4
- data/lib/datadog/di/code_tracker.rb +30 -3
- data/lib/datadog/di/component.rb +108 -0
- data/lib/datadog/di/configuration/settings.rb +69 -44
- data/lib/datadog/di/contrib/active_record.rb +11 -0
- data/lib/datadog/di/error.rb +17 -0
- data/lib/datadog/di/instrumenter.rb +27 -11
- data/lib/datadog/di/probe.rb +23 -1
- data/lib/datadog/di/probe_manager.rb +246 -0
- data/lib/datadog/di/probe_notification_builder.rb +4 -12
- data/lib/datadog/di/probe_notifier_worker.rb +68 -41
- data/lib/datadog/di/serializer.rb +143 -95
- data/lib/datadog/di/transport.rb +22 -9
- data/lib/datadog/di.rb +49 -1
- data/lib/datadog/version.rb +1 -1
- metadata +10 -7
@@ -0,0 +1,246 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
# rubocop:disable Lint/AssignmentInCondition
|
4
|
+
|
5
|
+
require 'monitor'
|
6
|
+
|
7
|
+
module Datadog
  module DI
    # Stores probes received from remote config (that we can parse, in other
    # words, whose type/attributes we support), requests needed instrumentation
    # for the probes via Instrumenter, and stores pending probes (those which
    # haven't yet been instrumented successfully due to their targets not
    # existing) and failed probes (where we are certain the target will not
    # ever be loaded, or otherwise become valid).
    #
    # All three probe collections are hashes keyed by probe id and are
    # guarded by a single Monitor. Monitor is reentrant, which is what allows
    # +install_pending_line_probes+ to call +add_probe+ while already
    # holding the lock.
    #
    # @api private
    class ProbeManager
      # Creates the manager and immediately enables a TracePoint for the
      # +:end+ event so that pending method probes can be installed as soon
      # as their target class/module definition finishes.
      #
      # @param settings object answering
      #   +dynamic_instrumentation.internal.propagate_all_exceptions+
      # @param instrumenter object answering +hook(probe, &block)+ and
      #   +unhook(probe)+
      # @param probe_notification_builder object answering +build_installed+,
      #   +build_emitting+ and +build_executed+
      # @param probe_notifier_worker object answering +add_status+ and
      #   +add_snapshot+
      # @param logger object answering +warn+
      # @param telemetry optional object answering
      #   +report(exception, description:)+; may be nil
      def initialize(settings, instrumenter, probe_notification_builder,
        probe_notifier_worker, logger, telemetry: nil)
        @settings = settings
        @instrumenter = instrumenter
        @probe_notification_builder = probe_notification_builder
        @probe_notifier_worker = probe_notifier_worker
        @logger = logger
        @telemetry = telemetry
        @installed_probes = {}
        @pending_probes = {}
        @failed_probes = {}
        @lock = Monitor.new

        @definition_trace_point = TracePoint.trace(:end) do |tp|
          install_pending_method_probes(tp.self)
        rescue => exc
          raise if settings.dynamic_instrumentation.internal.propagate_all_exceptions
          logger.warn("Unhandled exception in definition trace point: #{exc.class}: #{exc}")
          telemetry&.report(exc, description: "Unhandled exception in definition trace point")
          # TODO test this path
        end
      end

      attr_reader :logger
      attr_reader :telemetry

      # Disables the definition trace point and removes all installed hooks.
      #
      # TODO test that close is called during component teardown and
      # the trace point is cleared
      def close
        definition_trace_point.disable
        clear_hooks
      end

      # Forgets all pending probes and unhooks every installed probe.
      #
      # An exception raised by the instrumenter while unhooking propagates to
      # the caller; in that case the installed probe list is left intact.
      def clear_hooks
        @lock.synchronize do
          @pending_probes.clear
          @installed_probes.each_value do |probe|
            instrumenter.unhook(probe)
          end
          @installed_probes.clear
        end
      end

      attr_reader :settings
      attr_reader :instrumenter
      attr_reader :probe_notification_builder
      attr_reader :probe_notifier_worker

      # Probes that are currently instrumented, keyed by probe id.
      #
      # Returns the manager's own hash (not a copy).
      def installed_probes
        @lock.synchronize do
          @installed_probes
        end
      end

      # Probes whose targets were not defined when installation was
      # attempted, keyed by probe id.
      #
      # Returns the manager's own hash (not a copy).
      def pending_probes
        @lock.synchronize do
          @pending_probes
        end
      end

      # Probes that failed to instrument for reasons other than the target is
      # not yet loaded are added to this collection, so that we do not try
      # to instrument them every time remote configuration is processed.
      #
      # Maps probe id to the "ExceptionClass: message" string of the original
      # failure. Returns the manager's own hash (not a copy).
      def failed_probes
        @lock.synchronize do
          @failed_probes
        end
      end

      # Requests to install the specified probe.
      #
      # If the target of the probe does not exist, assume the relevant
      # code is not loaded yet (rather than that it will never be loaded),
      # and store the probe in a pending probe list. When classes are
      # defined, or files loaded, the probe will be checked against the
      # newly defined classes/loaded files, and will be installed if it
      # matches.
      #
      # @return [true] if the probe was instrumented now
      # @return [false] if the target is not defined yet and the probe was
      #   stored as pending
      # @raise [Error::ProbePreviouslyFailed] if an earlier attempt to install
      #   this probe id failed; the exception message is the recorded failure
      # @raise [StandardError] any other installation error is recorded in
      #   +failed_probes+ (unless "propagate all exceptions" is on) and
      #   re-raised
      def add_probe(probe)
        @lock.synchronize do
          # Probe failed to install previously, do not try to install it again.
          if msg = @failed_probes[probe.id]
            # TODO test this path
            raise Error::ProbePreviouslyFailed, msg
          end

          begin
            instrumenter.hook(probe, &method(:probe_executed_callback))

            @installed_probes[probe.id] = probe
            payload = probe_notification_builder.build_installed(probe)
            probe_notifier_worker.add_status(payload)
            # The probe would only be in the pending probes list if it was
            # previously attempted to be installed and the target was not loaded.
            # Always remove from pending list here because it makes the
            # API smaller and shouldn't cause any actual problems.
            @pending_probes.delete(probe.id)
            true
          rescue Error::DITargetNotDefined
            @pending_probes[probe.id] = probe
            false
          end
        rescue Error::ProbePreviouslyFailed
          # The original failure was already logged, reported and recorded.
          # Pass the exception through untouched so that the stored message
          # is not re-wrapped (and re-reported) on every retry.
          raise
        rescue => exc
          # In "propagate all exceptions" mode we will try to instrument again.
          raise if settings.dynamic_instrumentation.internal.propagate_all_exceptions

          logger.warn("Error processing probe configuration: #{exc.class}: #{exc}")
          telemetry&.report(exc, description: "Error processing probe configuration")
          # TODO report probe as failed to agent since we won't attempt to
          # install it again.

          # TODO add top stack frame to message
          @failed_probes[probe.id] = "#{exc.class}: #{exc}"

          raise
        end
      end

      # Removes probes with ids other than in the specified list.
      #
      # This method is meant to be invoked from remote config processor.
      # Remote config contains the list of currently defined probes; any
      # probes not in that list have been removed by user and should be
      # de-instrumented from the application.
      #
      # @param probe_ids collection of probe ids to keep (must answer
      #   +include?+)
      def remove_other_probes(probe_ids)
        @lock.synchronize do
          @pending_probes.delete_if do |_id, probe|
            !probe_ids.include?(probe.id)
          end

          # Iterate over a snapshot of the values because entries are
          # deleted from the hash along the way.
          @installed_probes.values.each do |probe|
            next if probe_ids.include?(probe.id)

            begin
              instrumenter.unhook(probe)
              # Only remove the probe from installed list if it was
              # successfully de-instrumented. Active probes do incur overhead
              # for the running application, and if the error is ephemeral
              # we want to try removing the probe again at the next opportunity.
              #
              # TODO give up after some time?
              @installed_probes.delete(probe.id)
            rescue => exc
              raise if settings.dynamic_instrumentation.internal.propagate_all_exceptions
              # Silence all exceptions?
              # TODO should we propagate here and rescue upstream?
              logger.warn("Error removing probe #{probe.id}: #{exc.class}: #{exc}")
              telemetry&.report(exc, description: "Error removing probe #{probe.id}")
            end
          end
        end
      end

      # Installs pending method probes, if any, for the specified class.
      #
      # This method is meant to be called from the "end" trace point,
      # which is invoked for each class definition.
      #
      # Every pending method probe whose type name equals the class name is
      # hooked, moved from the pending list to the installed list (so that
      # it can later be unhooked by +remove_other_probes+ / +clear_hooks+),
      # and the scan continues, since several probes may target different
      # methods of the same class.
      private def install_pending_method_probes(cls)
        @lock.synchronize do
          # TODO search more efficiently than linearly
          # Iterate over a snapshot because matched probes are removed from
          # the pending hash inside the loop.
          @pending_probes.values.each do |probe|
            next unless probe.method?
            # TODO move this stringification elsewhere
            next unless probe.type_name == cls.name

            begin
              # TODO is it OK to hook from trace point handler?
              # TODO the class is now defined, but can hooking still fail?
              instrumenter.hook(probe, &method(:probe_executed_callback))
              @installed_probes[probe.id] = probe
              @pending_probes.delete(probe.id)
            rescue Error::DITargetNotDefined
              # This should not happen... try installing again later?
            rescue => exc
              raise if settings.dynamic_instrumentation.internal.propagate_all_exceptions

              logger.warn("Error installing probe after class is defined: #{exc.class}: #{exc}")
              telemetry&.report(exc, description: "Error installing probe after class is defined")
            end
          end
        end
      end

      # Installs pending line probes, if any, for the file of the specified
      # absolute path.
      #
      # This method is meant to be called from the script_compiled trace
      # point, which is invoked for each required or loaded file
      # (and also for eval'd code, but those invocations are filtered out).
      #
      # Installation goes through +add_probe+, so an installation failure
      # raised there propagates out of this method.
      def install_pending_line_probes(path)
        @lock.synchronize do
          # Snapshot of the values: add_probe removes installed probes from
          # the pending hash.
          @pending_probes.values.each do |probe|
            add_probe(probe) if probe.line? && probe.file_matches?(path)
          end
        end
      end

      # Entry point invoked from the instrumentation when the specified probe
      # is invoked (that is, either its target method is invoked, or
      # execution reached its target file/line).
      #
      # This method is responsible for queueing probe status to be sent to the
      # backend (once per the probe's lifetime) and a snapshot corresponding
      # to the current invocation.
      #
      # @param probe the probe that fired
      # @param opts remaining keyword arguments, forwarded unchanged to
      #   +probe_notification_builder.build_executed+
      def probe_executed_callback(probe:, **opts)
        unless probe.emitting_notified?
          payload = probe_notification_builder.build_emitting(probe)
          probe_notifier_worker.add_status(payload)
          probe.emitting_notified = true
        end

        payload = probe_notification_builder.build_executed(probe, **opts)
        probe_notifier_worker.add_snapshot(payload)
      end

      # Class/module definition trace point (:end type).
      # Used to install hooks when the target classes/modules aren't yet
      # defined when the hook request is received.
      attr_reader :definition_trace_point
    end
  end
end
|
245
|
+
|
246
|
+
# rubocop:enable Lint/AssignmentInCondition
|
@@ -46,11 +46,13 @@ module Datadog
|
|
46
46
|
# this should be all frames for enriched probes and no frames for
|
47
47
|
# non-enriched probes?
|
48
48
|
build_snapshot(probe, rv: rv, snapshot: snapshot,
|
49
|
+
# Actual path of the instrumented file.
|
50
|
+
path: trace_point&.path,
|
49
51
|
duration: duration, caller_locations: caller_locations, args: args, kwargs: kwargs,
|
50
52
|
serialized_entry_args: serialized_entry_args)
|
51
53
|
end
|
52
54
|
|
53
|
-
def build_snapshot(probe, rv: nil, snapshot: nil,
|
55
|
+
def build_snapshot(probe, rv: nil, snapshot: nil, path: nil,
|
54
56
|
duration: nil, caller_locations: nil, args: nil, kwargs: nil,
|
55
57
|
serialized_entry_args: nil)
|
56
58
|
# TODO also verify that non-capturing probe does not pass
|
@@ -85,18 +87,8 @@ module Datadog
|
|
85
87
|
end
|
86
88
|
|
87
89
|
location = if probe.line?
|
88
|
-
actual_file = if probe.file
|
89
|
-
# Normally caller_locations should always be filled for a line probe
|
90
|
-
# but in the test suite we don't always provide all arguments.
|
91
|
-
actual_file_basename = File.basename(probe.file)
|
92
|
-
caller_locations&.detect do |loc|
|
93
|
-
# TODO record actual path that probe was installed into,
|
94
|
-
# perform exact match here against that path.
|
95
|
-
File.basename(loc.path) == actual_file_basename
|
96
|
-
end&.path || probe.file
|
97
|
-
end
|
98
90
|
{
|
99
|
-
file:
|
91
|
+
file: path,
|
100
92
|
lines: [probe.line_no],
|
101
93
|
}
|
102
94
|
elsif probe.method?
|
@@ -23,12 +23,9 @@ module Datadog
|
|
23
23
|
#
|
24
24
|
# @api private
|
25
25
|
class ProbeNotifierWorker
|
26
|
-
|
27
|
-
# TODO make this into an internal setting and increase default to 2 or 3.
|
28
|
-
MIN_SEND_INTERVAL = 1
|
29
|
-
|
30
|
-
def initialize(settings, transport, logger)
|
26
|
+
def initialize(settings, transport, logger, telemetry: nil)
|
31
27
|
@settings = settings
|
28
|
+
@telemetry = telemetry
|
32
29
|
@status_queue = []
|
33
30
|
@snapshot_queue = []
|
34
31
|
@transport = transport
|
@@ -39,10 +36,12 @@ module Datadog
|
|
39
36
|
@sleep_remaining = nil
|
40
37
|
@wake_scheduled = false
|
41
38
|
@thread = nil
|
39
|
+
@flush = 0
|
42
40
|
end
|
43
41
|
|
44
42
|
attr_reader :settings
|
45
43
|
attr_reader :logger
|
44
|
+
attr_reader :telemetry
|
46
45
|
|
47
46
|
def start
|
48
47
|
return if @thread
|
@@ -53,33 +52,38 @@ module Datadog
|
|
53
52
|
# and then quit?
|
54
53
|
break if @stop_requested
|
55
54
|
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
55
|
+
# If a flush was requested, send immediately and do not
|
56
|
+
# wait for the cooldown period.
|
57
|
+
if @lock.synchronize { @flush } == 0
|
58
|
+
sleep_remaining = @lock.synchronize do
|
59
|
+
if sleep_remaining && sleep_remaining > 0
|
60
|
+
# Recalculate how much sleep time is remaining, then sleep that long.
|
61
|
+
set_sleep_remaining
|
62
|
+
else
|
63
|
+
0
|
64
|
+
end
|
62
65
|
end
|
63
|
-
end
|
64
66
|
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
67
|
+
if sleep_remaining > 0
|
68
|
+
# Do not need to update @wake_scheduled here because
|
69
|
+
# wake-up is already scheduled for the earliest possible time.
|
70
|
+
wake.wait(sleep_remaining)
|
71
|
+
next
|
72
|
+
end
|
70
73
|
end
|
71
74
|
|
72
75
|
begin
|
73
76
|
more = maybe_send
|
74
77
|
rescue => exc
|
75
|
-
raise if settings.dynamic_instrumentation.propagate_all_exceptions
|
78
|
+
raise if settings.dynamic_instrumentation.internal.propagate_all_exceptions
|
76
79
|
|
77
80
|
logger.warn("Error in probe notifier worker: #{exc.class}: #{exc} (at #{exc.backtrace.first})")
|
81
|
+
telemetry&.report(exc, description: "Error in probe notifier worker")
|
78
82
|
end
|
79
83
|
@lock.synchronize do
|
80
84
|
@wake_scheduled = more
|
81
85
|
end
|
82
|
-
wake.wait(more ?
|
86
|
+
wake.wait(more ? min_send_interval : nil)
|
83
87
|
end
|
84
88
|
end
|
85
89
|
end
|
@@ -106,26 +110,40 @@ module Datadog
|
|
106
110
|
# therefore, it should only be called when there is no parallel
|
107
111
|
# activity (in another thread) that causes more notifications
|
108
112
|
# to be generated.
|
113
|
+
#
|
114
|
+
# This method is used by the test suite to wait until notifications have
|
115
|
+
# been sent out, and could be used for graceful stopping of the
|
116
|
+
# worker thread.
|
109
117
|
def flush
  # Announce the flush request; the counter is decremented again in the
  # ensure clause below regardless of how this method exits.
  @lock.synchronize { @flush += 1 }

  begin
    loop do
      # Nothing to wait for when the worker thread is absent or dead.
      return unless @thread&.alive?

      busy, drained = @lock.synchronize do
        [io_in_progress?, status_queue.empty? && snapshot_queue.empty?]
      end

      # Done once no send is in flight and both queues are empty.
      break if !busy && drained

      # Queues still hold entries and no send is running: wake the worker.
      wake.signal unless busy

      # If we just call Thread.pass we could be in a busy loop -
      # add a sleep.
      sleep 0.25
    end
  ensure
    @lock.synchronize { @flush -= 1 }
  end
end
|
@@ -136,6 +154,11 @@ module Datadog
|
|
136
154
|
attr_reader :wake
|
137
155
|
attr_reader :thread
|
138
156
|
|
157
|
+
# Shorthand for the internal dynamic instrumentation setting of the same
# name, so that call sites in the rest of the file stay short.
def min_send_interval
  internal_settings = settings.dynamic_instrumentation.internal
  internal_settings.min_send_interval
end
|
161
|
+
|
139
162
|
# This method should be called while @lock is held.
|
140
163
|
def io_in_progress?
|
141
164
|
@io_in_progress
|
@@ -181,14 +204,14 @@ module Datadog
|
|
181
204
|
end
|
182
205
|
|
183
206
|
# Determine how much longer the worker thread should sleep
|
184
|
-
# so as not to send in less than
|
207
|
+
# so as not to send in less than min send interval since the last send.
|
185
208
|
# Important: this method must be called when @lock is held.
|
186
209
|
#
|
187
210
|
# Returns the time remaining to sleep.
|
188
211
|
def set_sleep_remaining
|
189
212
|
now = Core::Utils::Time.get_time
|
190
213
|
@sleep_remaining = if last_sent
|
191
|
-
[last_sent +
|
214
|
+
[last_sent + min_send_interval - now, 0].max
|
192
215
|
else
|
193
216
|
0
|
194
217
|
end
|
@@ -218,16 +241,20 @@ module Datadog
|
|
218
241
|
@last_sent = time
|
219
242
|
end
|
220
243
|
rescue => exc
|
221
|
-
raise if settings.dynamic_instrumentation.propagate_all_exceptions
|
244
|
+
raise if settings.dynamic_instrumentation.internal.propagate_all_exceptions
|
222
245
|
logger.warn("failed to send #{event_name}: #{exc.class}: #{exc} (at #{exc.backtrace.first})")
|
246
|
+
# Should we report this error to telemetry? Most likely failure
|
247
|
+
# to send is due to a network issue, and trying to send a
|
248
|
+
# telemetry message would also fail.
|
223
249
|
end
|
224
250
|
end
|
225
251
|
batch.any? # steep:ignore
|
226
|
-
rescue ThreadError
|
252
|
+
rescue ThreadError => exc
|
227
253
|
# Normally the queue should only be consumed in this method,
|
228
254
|
# however if anyone consumes it elsewhere we don't want to block
|
229
255
|
# while consuming it here. Rescue ThreadError and return.
|
230
|
-
logger.warn("
|
256
|
+
logger.warn("Unexpected #{event_name} queue underflow - consumed elsewhere?")
|
257
|
+
telemetry&.report(exc, description: "Unexpected #{event_name} queue underflow")
|
231
258
|
ensure
|
232
259
|
@lock.synchronize do
|
233
260
|
@io_in_progress = false
|