datadog 2.6.0 → 2.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +20 -1
- data/ext/libdatadog_api/crashtracker.c +6 -4
- data/ext/libdatadog_extconf_helpers.rb +1 -1
- data/lib/datadog/core/configuration/settings.rb +4 -4
- data/lib/datadog/di/code_tracker.rb +30 -3
- data/lib/datadog/di/component.rb +108 -0
- data/lib/datadog/di/configuration/settings.rb +69 -44
- data/lib/datadog/di/contrib/active_record.rb +11 -0
- data/lib/datadog/di/error.rb +17 -0
- data/lib/datadog/di/instrumenter.rb +27 -11
- data/lib/datadog/di/probe.rb +23 -1
- data/lib/datadog/di/probe_manager.rb +246 -0
- data/lib/datadog/di/probe_notification_builder.rb +4 -12
- data/lib/datadog/di/probe_notifier_worker.rb +68 -41
- data/lib/datadog/di/serializer.rb +143 -95
- data/lib/datadog/di/transport.rb +22 -9
- data/lib/datadog/di.rb +49 -1
- data/lib/datadog/version.rb +1 -1
- metadata +10 -7
@@ -0,0 +1,246 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
# rubocop:disable Lint/AssignmentInCondition
|
4
|
+
|
5
|
+
require 'monitor'
|
6
|
+
|
7
|
+
module Datadog
  module DI
    # Stores probes received from remote config (that we can parse, in other
    # words, whose type/attributes we support), requests needed instrumentation
    # for the probes via Instrumenter, and stores pending probes (those which
    # haven't yet been instrumented successfully due to their targets not
    # existing) and failed probes (where we are certain the target will not
    # ever be loaded, or otherwise become valid).
    #
    # All three probe collections are hashes keyed by probe id and are
    # guarded by a single re-entrant Monitor.
    #
    # @api private
    class ProbeManager
      # Creates the manager and immediately enables a global :end trace
      # point, which fires whenever a class/module definition finishes and
      # is used to install pending method probes for that class.
      #
      # @param settings settings object; this class reads
      #   +dynamic_instrumentation.internal.propagate_all_exceptions+ from it
      # @param instrumenter object responding to +hook(probe, &block)+ and
      #   +unhook(probe)+
      # @param probe_notification_builder object building status/snapshot
      #   payloads (+build_installed+, +build_emitting+, +build_executed+)
      # @param probe_notifier_worker object queueing payloads
      #   (+add_status+, +add_snapshot+)
      # @param logger object responding to +warn+
      # @param telemetry optional object responding to +report+
      def initialize(settings, instrumenter, probe_notification_builder,
        probe_notifier_worker, logger, telemetry: nil)
        @settings = settings
        @instrumenter = instrumenter
        @probe_notification_builder = probe_notification_builder
        @probe_notifier_worker = probe_notifier_worker
        @logger = logger
        @telemetry = telemetry
        @installed_probes = {}
        @pending_probes = {}
        @failed_probes = {}
        # Monitor (rather than Mutex) because install_pending_line_probes
        # calls add_probe while already holding the lock.
        @lock = Monitor.new

        @definition_trace_point = TracePoint.trace(:end) do |tp|
          install_pending_method_probes(tp.self)
        rescue => exc
          raise if settings.dynamic_instrumentation.internal.propagate_all_exceptions
          logger.warn("Unhandled exception in definition trace point: #{exc.class}: #{exc}")
          telemetry&.report(exc, description: "Unhandled exception in definition trace point")
          # TODO test this path
        end
      end

      attr_reader :logger
      attr_reader :telemetry

      # Disables the class definition trace point and removes all
      # instrumentation installed by this manager.
      #
      # TODO test that close is called during component teardown and
      # the trace point is cleared
      def close
        definition_trace_point.disable
        clear_hooks
      end

      # Forgets all pending probes and unhooks every installed probe.
      def clear_hooks
        @lock.synchronize do
          @pending_probes.clear
          @installed_probes.each_value do |probe|
            instrumenter.unhook(probe)
          end
          @installed_probes.clear
        end
      end

      attr_reader :settings
      attr_reader :instrumenter
      attr_reader :probe_notification_builder
      attr_reader :probe_notifier_worker

      # Probes that are currently instrumented, keyed by probe id.
      #
      # The lock only guards reading the reference; the returned hash is the
      # live internal collection, not a copy.
      def installed_probes
        @lock.synchronize do
          @installed_probes
        end
      end

      # Probes whose targets were not defined when installation was
      # attempted, keyed by probe id. Returns the live internal hash.
      def pending_probes
        @lock.synchronize do
          @pending_probes
        end
      end

      # Probes that failed to instrument for reasons other than the target is
      # not yet loaded are added to this collection, so that we do not try
      # to instrument them every time remote configuration is processed.
      #
      # Maps probe id to a "ExceptionClass: message" string describing the
      # original failure. Returns the live internal hash.
      def failed_probes
        @lock.synchronize do
          @failed_probes
        end
      end

      # Requests to install the specified probe.
      #
      # If the target of the probe does not exist, assume the relevant
      # code is not loaded yet (rather than that it will never be loaded),
      # and store the probe in a pending probe list. When classes are
      # defined, or files loaded, the probe will be checked against the
      # newly defined classes/loaded files, and will be installed if it
      # matches.
      #
      # @return [Boolean] true if the probe was instrumented, false if it
      #   was stored as pending
      # @raise [Error::ProbePreviouslyFailed] if an earlier installation
      #   attempt for this probe id failed; the message is the recorded
      #   description of the original failure
      # @raise [StandardError] any other installation error is recorded in
      #   the failed probes collection and then re-raised
      def add_probe(probe)
        @lock.synchronize do
          # Probe failed to install previously, do not try to install it again.
          previous_failure = @failed_probes[probe.id]
          if previous_failure
            # TODO test this path
            raise Error::ProbePreviouslyFailed, previous_failure
          end

          begin
            instrumenter.hook(probe, &method(:probe_executed_callback))

            @installed_probes[probe.id] = probe
            payload = probe_notification_builder.build_installed(probe)
            probe_notifier_worker.add_status(payload)
            # The probe would only be in the pending probes list if it was
            # previously attempted to be installed and the target was not loaded.
            # Always remove from pending list here because it makes the
            # API smaller and shouldn't cause any actual problems.
            @pending_probes.delete(probe.id)
            true
          rescue Error::DITargetNotDefined
            @pending_probes[probe.id] = probe
            false
          end
        rescue Error::ProbePreviouslyFailed
          # The failure is already recorded. Re-raise untouched so that the
          # stored description of the original error is not overwritten
          # (and does not grow) every time the same probe is offered again.
          raise
        rescue => exc
          # In "propagate all exceptions" mode we will try to instrument again.
          raise if settings.dynamic_instrumentation.internal.propagate_all_exceptions

          logger.warn("Error processing probe configuration: #{exc.class}: #{exc}")
          telemetry&.report(exc, description: "Error processing probe configuration")
          # TODO report probe as failed to agent since we won't attempt to
          # install it again.

          # TODO add top stack frame to message
          @failed_probes[probe.id] = "#{exc.class}: #{exc}"

          raise
        end
      end

      # Removes probes with ids other than in the specified list.
      #
      # This method is meant to be invoked from remote config processor.
      # Remote config contains the list of currently defined probes; any
      # probes not in that list have been removed by user and should be
      # de-instrumented from the application.
      #
      # @param probe_ids collection of probe ids to keep (must respond to
      #   +include?+)
      def remove_other_probes(probe_ids)
        @lock.synchronize do
          # Pending probes are keyed by their own id, so filtering by key
          # is sufficient.
          @pending_probes.delete_if { |probe_id, _probe| !probe_ids.include?(probe_id) }

          # Iterate over a snapshot because entries are deleted in the loop.
          @installed_probes.values.each do |probe|
            next if probe_ids.include?(probe.id)

            begin
              instrumenter.unhook(probe)
              # Only remove the probe from installed list if it was
              # successfully de-instrumented. Active probes do incur overhead
              # for the running application, and if the error is ephemeral
              # we want to try removing the probe again at the next opportunity.
              #
              # TODO give up after some time?
              @installed_probes.delete(probe.id)
            rescue => exc
              raise if settings.dynamic_instrumentation.internal.propagate_all_exceptions
              # Silence all exceptions?
              # TODO should we propagate here and rescue upstream?
              logger.warn("Error removing probe #{probe.id}: #{exc.class}: #{exc}")
              telemetry&.report(exc, description: "Error removing probe #{probe.id}")
            end
          end
        end
      end

      # Installs pending method probes, if any, for the specified class.
      #
      # This method is meant to be called from the "end" trace point,
      # which is invoked for each class definition.
      #
      # Every pending method probe whose type name equals the class name is
      # hooked, recorded as installed (so that it can later be unhooked by
      # remove_other_probes / clear_hooks), removed from the pending list,
      # and reported with an "installed" status, mirroring add_probe.
      private def install_pending_method_probes(cls)
        @lock.synchronize do
          # Iterate over a snapshot because entries are deleted in the loop.
          # TODO search more efficiently than linearly
          @pending_probes.values.each do |probe|
            # TODO move this stringification elsewhere
            next unless probe.method? && probe.type_name == cls.name

            begin
              # TODO is it OK to hook from trace point handler?
              # TODO the class is now defined, but can hooking still fail?
              instrumenter.hook(probe, &method(:probe_executed_callback))
              @installed_probes[probe.id] = probe
              @pending_probes.delete(probe.id)

              payload = probe_notification_builder.build_installed(probe)
              probe_notifier_worker.add_status(payload)
            rescue Error::DITargetNotDefined
              # This should not happen... try installing again later?
              # The probe is left in the pending list.
            rescue => exc
              raise if settings.dynamic_instrumentation.internal.propagate_all_exceptions

              logger.warn("Error installing probe after class is defined: #{exc.class}: #{exc}")
              telemetry&.report(exc, description: "Error installing probe after class is defined")
            end
          end
        end
      end

      # Installs pending line probes, if any, for the file of the specified
      # absolute path.
      #
      # This method is meant to be called from the script_compiled trace
      # point, which is invoked for each required or loaded file
      # (and also for eval'd code, but those invocations are filtered out).
      #
      # Installation is delegated to add_probe, so any exception add_probe
      # raises propagates to the caller and stops processing of the
      # remaining pending probes.
      def install_pending_line_probes(path)
        @lock.synchronize do
          # add_probe mutates @pending_probes, hence the snapshot.
          @pending_probes.values.each do |probe|
            add_probe(probe) if probe.line? && probe.file_matches?(path)
          end
        end
      end

      # Entry point invoked from the instrumentation when the specified probe
      # is invoked (that is, either its target method is invoked, or
      # execution reached its target file/line).
      #
      # This method is responsible for queueing probe status to be sent to the
      # backend (once per the probe's lifetime) and a snapshot corresponding
      # to the current invocation.
      #
      # @param probe the probe that was executed
      # @param opts remaining keyword arguments, forwarded unchanged to
      #   +probe_notification_builder.build_executed+
      def probe_executed_callback(probe:, **opts)
        unless probe.emitting_notified?
          payload = probe_notification_builder.build_emitting(probe)
          probe_notifier_worker.add_status(payload)
          probe.emitting_notified = true
        end

        payload = probe_notification_builder.build_executed(probe, **opts)
        probe_notifier_worker.add_snapshot(payload)
      end

      # Class/module definition trace point (:end type).
      # Used to install hooks when the target classes/modules aren't yet
      # defined when the hook request is received.
      attr_reader :definition_trace_point
    end
  end
end
|
245
|
+
|
246
|
+
# rubocop:enable Lint/AssignmentInCondition
|
@@ -46,11 +46,13 @@ module Datadog
|
|
46
46
|
# this should be all frames for enriched probes and no frames for
|
47
47
|
# non-enriched probes?
|
48
48
|
build_snapshot(probe, rv: rv, snapshot: snapshot,
|
49
|
+
# Actual path of the instrumented file.
|
50
|
+
path: trace_point&.path,
|
49
51
|
duration: duration, caller_locations: caller_locations, args: args, kwargs: kwargs,
|
50
52
|
serialized_entry_args: serialized_entry_args)
|
51
53
|
end
|
52
54
|
|
53
|
-
def build_snapshot(probe, rv: nil, snapshot: nil,
|
55
|
+
def build_snapshot(probe, rv: nil, snapshot: nil, path: nil,
|
54
56
|
duration: nil, caller_locations: nil, args: nil, kwargs: nil,
|
55
57
|
serialized_entry_args: nil)
|
56
58
|
# TODO also verify that non-capturing probe does not pass
|
@@ -85,18 +87,8 @@ module Datadog
|
|
85
87
|
end
|
86
88
|
|
87
89
|
location = if probe.line?
|
88
|
-
actual_file = if probe.file
|
89
|
-
# Normally caller_locations should always be filled for a line probe
|
90
|
-
# but in the test suite we don't always provide all arguments.
|
91
|
-
actual_file_basename = File.basename(probe.file)
|
92
|
-
caller_locations&.detect do |loc|
|
93
|
-
# TODO record actual path that probe was installed into,
|
94
|
-
# perform exact match here against that path.
|
95
|
-
File.basename(loc.path) == actual_file_basename
|
96
|
-
end&.path || probe.file
|
97
|
-
end
|
98
90
|
{
|
99
|
-
file:
|
91
|
+
file: path,
|
100
92
|
lines: [probe.line_no],
|
101
93
|
}
|
102
94
|
elsif probe.method?
|
@@ -23,12 +23,9 @@ module Datadog
|
|
23
23
|
#
|
24
24
|
# @api private
|
25
25
|
class ProbeNotifierWorker
|
26
|
-
|
27
|
-
# TODO make this into an internal setting and increase default to 2 or 3.
|
28
|
-
MIN_SEND_INTERVAL = 1
|
29
|
-
|
30
|
-
def initialize(settings, transport, logger)
|
26
|
+
def initialize(settings, transport, logger, telemetry: nil)
|
31
27
|
@settings = settings
|
28
|
+
@telemetry = telemetry
|
32
29
|
@status_queue = []
|
33
30
|
@snapshot_queue = []
|
34
31
|
@transport = transport
|
@@ -39,10 +36,12 @@ module Datadog
|
|
39
36
|
@sleep_remaining = nil
|
40
37
|
@wake_scheduled = false
|
41
38
|
@thread = nil
|
39
|
+
@flush = 0
|
42
40
|
end
|
43
41
|
|
44
42
|
attr_reader :settings
|
45
43
|
attr_reader :logger
|
44
|
+
attr_reader :telemetry
|
46
45
|
|
47
46
|
def start
|
48
47
|
return if @thread
|
@@ -53,33 +52,38 @@ module Datadog
|
|
53
52
|
# and then quit?
|
54
53
|
break if @stop_requested
|
55
54
|
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
55
|
+
# If a flush was requested, send immediately and do not
|
56
|
+
# wait for the cooldown period.
|
57
|
+
if @lock.synchronize { @flush } == 0
|
58
|
+
sleep_remaining = @lock.synchronize do
|
59
|
+
if sleep_remaining && sleep_remaining > 0
|
60
|
+
# Recalculate how much sleep time is remaining, then sleep that long.
|
61
|
+
set_sleep_remaining
|
62
|
+
else
|
63
|
+
0
|
64
|
+
end
|
62
65
|
end
|
63
|
-
end
|
64
66
|
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
67
|
+
if sleep_remaining > 0
|
68
|
+
# Do not need to update @wake_scheduled here because
|
69
|
+
# wake-up is already scheduled for the earliest possible time.
|
70
|
+
wake.wait(sleep_remaining)
|
71
|
+
next
|
72
|
+
end
|
70
73
|
end
|
71
74
|
|
72
75
|
begin
|
73
76
|
more = maybe_send
|
74
77
|
rescue => exc
|
75
|
-
raise if settings.dynamic_instrumentation.propagate_all_exceptions
|
78
|
+
raise if settings.dynamic_instrumentation.internal.propagate_all_exceptions
|
76
79
|
|
77
80
|
logger.warn("Error in probe notifier worker: #{exc.class}: #{exc} (at #{exc.backtrace.first})")
|
81
|
+
telemetry&.report(exc, description: "Error in probe notifier worker")
|
78
82
|
end
|
79
83
|
@lock.synchronize do
|
80
84
|
@wake_scheduled = more
|
81
85
|
end
|
82
|
-
wake.wait(more ?
|
86
|
+
wake.wait(more ? min_send_interval : nil)
|
83
87
|
end
|
84
88
|
end
|
85
89
|
end
|
@@ -106,26 +110,40 @@ module Datadog
|
|
106
110
|
# therefore, it should only be called when there is no parallel
|
107
111
|
# activity (in another thread) that causes more notifications
|
108
112
|
# to be generated.
|
113
|
+
#
|
114
|
+
# This method is used by the test suite to wait until notifications have
|
115
|
+
# been sent out, and could be used for graceful stopping of the
|
116
|
+
# worker thread.
|
109
117
|
def flush
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
118
|
+
@lock.synchronize do
|
119
|
+
@flush += 1
|
120
|
+
end
|
121
|
+
begin
|
122
|
+
loop do
|
123
|
+
if @thread.nil? || !@thread.alive?
|
124
|
+
return
|
125
|
+
end
|
114
126
|
|
115
|
-
|
116
|
-
|
117
|
-
|
127
|
+
io_in_progress, queues_empty = @lock.synchronize do
|
128
|
+
[io_in_progress?, status_queue.empty? && snapshot_queue.empty?]
|
129
|
+
end
|
118
130
|
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
131
|
+
if io_in_progress
|
132
|
+
# If we just call Thread.pass we could be in a busy loop -
|
133
|
+
# add a sleep.
|
134
|
+
sleep 0.25
|
135
|
+
next
|
136
|
+
elsif queues_empty
|
137
|
+
break
|
138
|
+
else
|
139
|
+
wake.signal
|
140
|
+
sleep 0.25
|
141
|
+
next
|
142
|
+
end
|
143
|
+
end
|
144
|
+
ensure
|
145
|
+
@lock.synchronize do
|
146
|
+
@flush -= 1
|
129
147
|
end
|
130
148
|
end
|
131
149
|
end
|
@@ -136,6 +154,11 @@ module Datadog
|
|
136
154
|
attr_reader :wake
|
137
155
|
attr_reader :thread
|
138
156
|
|
157
|
+
# Convenience method to keep line length reasonable in the rest of the file.
|
158
|
+
def min_send_interval
|
159
|
+
settings.dynamic_instrumentation.internal.min_send_interval
|
160
|
+
end
|
161
|
+
|
139
162
|
# This method should be called while @lock is held.
|
140
163
|
def io_in_progress?
|
141
164
|
@io_in_progress
|
@@ -181,14 +204,14 @@ module Datadog
|
|
181
204
|
end
|
182
205
|
|
183
206
|
# Determine how much longer the worker thread should sleep
|
184
|
-
# so as not to send in less than
|
207
|
+
# so as not to send in less than min send interval since the last send.
|
185
208
|
# Important: this method must be called when @lock is held.
|
186
209
|
#
|
187
210
|
# Returns the time remaining to sleep.
|
188
211
|
def set_sleep_remaining
|
189
212
|
now = Core::Utils::Time.get_time
|
190
213
|
@sleep_remaining = if last_sent
|
191
|
-
[last_sent +
|
214
|
+
[last_sent + min_send_interval - now, 0].max
|
192
215
|
else
|
193
216
|
0
|
194
217
|
end
|
@@ -218,16 +241,20 @@ module Datadog
|
|
218
241
|
@last_sent = time
|
219
242
|
end
|
220
243
|
rescue => exc
|
221
|
-
raise if settings.dynamic_instrumentation.propagate_all_exceptions
|
244
|
+
raise if settings.dynamic_instrumentation.internal.propagate_all_exceptions
|
222
245
|
logger.warn("failed to send #{event_name}: #{exc.class}: #{exc} (at #{exc.backtrace.first})")
|
246
|
+
# Should we report this error to telemetry? Most likely failure
|
247
|
+
# to send is due to a network issue, and trying to send a
|
248
|
+
# telemetry message would also fail.
|
223
249
|
end
|
224
250
|
end
|
225
251
|
batch.any? # steep:ignore
|
226
|
-
rescue ThreadError
|
252
|
+
rescue ThreadError => exc
|
227
253
|
# Normally the queue should only be consumed in this method,
|
228
254
|
# however if anyone consumes it elsewhere we don't want to block
|
229
255
|
# while consuming it here. Rescue ThreadError and return.
|
230
|
-
logger.warn("
|
256
|
+
logger.warn("Unexpected #{event_name} queue underflow - consumed elsewhere?")
|
257
|
+
telemetry&.report(exc, description: "Unexpected #{event_name} queue underflow")
|
231
258
|
ensure
|
232
259
|
@lock.synchronize do
|
233
260
|
@io_in_progress = false
|