datadog 2.5.0 → 2.7.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,246 @@
1
+ # frozen_string_literal: true
2
+
3
+ # rubocop:disable Lint/AssignmentInCondition
4
+
5
+ require 'monitor'
6
+
7
+ module Datadog
8
+ module DI
9
+ # Stores probes received from remote config (that we can parse, in other
10
+ # words, whose type/attributes we support), requests needed instrumentation
11
+ # for the probes via Instrumenter, and stores pending probes (those which
12
+ # haven't yet been instrumented successfully due to their targets not
13
+ # existing) and failed probes (where we are certain the target will not
14
+ # ever be loaded, or otherwise become valid).
15
+ #
16
+ # @api private
17
class ProbeManager
  # Builds the manager and immediately enables an :end trace point, which
  # fires when a class/module definition body finishes, so that pending
  # method probes can be installed once their target class exists.
  #
  # @param settings settings object; this class only reads
  #   +dynamic_instrumentation.internal.propagate_all_exceptions+ from it
  # @param instrumenter object responding to +hook(probe, &block)+ and
  #   +unhook(probe)+
  # @param probe_notification_builder builds status/snapshot payloads
  # @param probe_notifier_worker receives payloads via +add_status+ and
  #   +add_snapshot+
  # @param logger object responding to +warn+
  # @param telemetry optional object responding to +report+
  def initialize(settings, instrumenter, probe_notification_builder,
    probe_notifier_worker, logger, telemetry: nil)
    @settings = settings
    @instrumenter = instrumenter
    @probe_notification_builder = probe_notification_builder
    @probe_notifier_worker = probe_notifier_worker
    @logger = logger
    @telemetry = telemetry
    @installed_probes = {}
    @pending_probes = {}
    @failed_probes = {}
    # Monitor (rather than Mutex) because the lock is re-entered:
    # install_pending_line_probes calls add_probe while holding it.
    @lock = Monitor.new

    @definition_trace_point = TracePoint.trace(:end) do |tp|
      install_pending_method_probes(tp.self)
    rescue => exc
      raise if settings.dynamic_instrumentation.internal.propagate_all_exceptions
      logger.warn("Unhandled exception in definition trace point: #{exc.class}: #{exc}")
      telemetry&.report(exc, description: "Unhandled exception in definition trace point")
      # TODO test this path
    end
  end

  attr_reader :logger
  attr_reader :telemetry

  # Disables the class definition trace point and removes all hooks.
  #
  # TODO test that close is called during component teardown and
  # the trace point is cleared
  def close
    definition_trace_point.disable
    clear_hooks
  end

  # Forgets all pending probes and unhooks every installed probe.
  def clear_hooks
    @lock.synchronize do
      @pending_probes.clear
      @installed_probes.each_value do |probe|
        instrumenter.unhook(probe)
      end
      @installed_probes.clear
    end
  end

  attr_reader :settings
  attr_reader :instrumenter
  attr_reader :probe_notification_builder
  attr_reader :probe_notifier_worker

  # @return [Hash] installed probes keyed by probe id (the live
  #   collection, not a copy)
  def installed_probes
    @lock.synchronize do
      @installed_probes
    end
  end

  # @return [Hash] pending probes keyed by probe id (the live
  #   collection, not a copy)
  def pending_probes
    @lock.synchronize do
      @pending_probes
    end
  end

  # Probes that failed to instrument for reasons other than the target is
  # not yet loaded are added to this collection, so that we do not try
  # to instrument them every time remote configuration is processed.
  #
  # @return [Hash] failure messages keyed by probe id
  def failed_probes
    @lock.synchronize do
      @failed_probes
    end
  end

  # Requests to install the specified probe.
  #
  # If the target of the probe does not exist, assume the relevant
  # code is not loaded yet (rather than that it will never be loaded),
  # and store the probe in a pending probe list. When classes are
  # defined, or files loaded, the probe will be checked against the
  # newly defined classes/loaded files, and will be installed if it
  # matches.
  #
  # @return [Boolean] true if the probe was installed, false if it was
  #   stored as pending
  # @raise [Error::ProbePreviouslyFailed] if an earlier attempt to install
  #   this probe failed; the message is the originally recorded failure
  # @raise [StandardError] any other installation error, re-raised after
  #   the probe has been recorded as failed
  def add_probe(probe)
    @lock.synchronize do
      # Probe failed to install previously, do not try to install it again.
      # This check is deliberately outside of the begin/rescue below:
      # otherwise the rejection itself would be logged and reported as a
      # new failure, and the stored message would be overwritten with a
      # nested "ProbePreviouslyFailed: ..." string on every attempt.
      previous_failure = @failed_probes[probe.id]
      # TODO test this path
      raise Error::ProbePreviouslyFailed, previous_failure if previous_failure

      begin
        install_and_announce(probe)
        true
      rescue Error::DITargetNotDefined
        @pending_probes[probe.id] = probe
        false
      rescue => exc
        # In "propagate all exceptions" mode we will try to instrument again.
        raise if settings.dynamic_instrumentation.internal.propagate_all_exceptions

        logger.warn("Error processing probe configuration: #{exc.class}: #{exc}")
        telemetry&.report(exc, description: "Error processing probe configuration")
        # TODO report probe as failed to agent since we won't attempt to
        # install it again.

        # TODO add top stack frame to message
        @failed_probes[probe.id] = "#{exc.class}: #{exc}"

        raise
      end
    end
  end

  # Removes probes with ids other than in the specified list.
  #
  # This method is meant to be invoked from remote config processor.
  # Remote config contains the list of currently defined probes; any
  # probes not in that list have been removed by user and should be
  # de-instrumented from the application.
  #
  # @param probe_ids collection of probe ids to keep (must respond to
  #   +include?+)
  def remove_other_probes(probe_ids)
    @lock.synchronize do
      @pending_probes.delete_if do |probe_id, _probe|
        !probe_ids.include?(probe_id)
      end

      # Iterate over a snapshot (+values+) because entries are deleted
      # from the hash inside the loop.
      @installed_probes.values.each do |probe|
        next if probe_ids.include?(probe.id)

        begin
          instrumenter.unhook(probe)
          # Only remove the probe from installed list if it was
          # successfully de-instrumented. Active probes do incur overhead
          # for the running application, and if the error is ephemeral
          # we want to try removing the probe again at the next opportunity.
          #
          # TODO give up after some time?
          @installed_probes.delete(probe.id)
        rescue => exc
          raise if settings.dynamic_instrumentation.internal.propagate_all_exceptions
          # Silence all exceptions?
          # TODO should we propagate here and rescue upstream?
          logger.warn("Error removing probe #{probe.id}: #{exc.class}: #{exc}")
          telemetry&.report(exc, description: "Error removing probe #{probe.id}")
        end
      end
    end
  end

  # Installs pending method probes, if any, for the specified class.
  #
  # This method is meant to be called from the "end" trace point,
  # which is invoked for each class definition.
  #
  # Every pending method probe targeting the class is installed (not only
  # the first one found), and each is recorded as installed so that it can
  # later be unhooked by remove_other_probes / clear_hooks.
  private def install_pending_method_probes(cls)
    @lock.synchronize do
      # Iterate over a snapshot since installation removes entries from
      # the pending collection.
      # TODO search more efficiently than linearly
      @pending_probes.values.each do |probe|
        next unless probe.method?
        # TODO move this stringification elsewhere
        next unless probe.type_name == cls.name

        begin
          # TODO is it OK to hook from trace point handler?
          # TODO the class is now defined, but can hooking still fail?
          install_and_announce(probe)
        rescue Error::DITargetNotDefined
          # This should not happen... try installing again later?
        rescue => exc
          raise if settings.dynamic_instrumentation.internal.propagate_all_exceptions

          logger.warn("Error installing probe after class is defined: #{exc.class}: #{exc}")
          telemetry&.report(exc, description: "Error installing probe after class is defined")
        end
      end
    end
  end

  # Hooks the probe and, if hooking succeeded, moves it from the pending
  # to the installed collection and queues the "installed" status.
  #
  # Bookkeeping is done before the notification so that a probe which is
  # actually hooked is always tracked (and therefore removable) even if
  # building or queueing the status raises.
  #
  # Must be called while @lock is held. Exceptions from the instrumenter
  # and the notification objects propagate to the caller.
  private def install_and_announce(probe)
    instrumenter.hook(probe, &method(:probe_executed_callback))

    @installed_probes[probe.id] = probe
    # The probe would only be in the pending probes list if it was
    # previously attempted to be installed and the target was not loaded.
    # Always remove from pending list here because it makes the
    # API smaller and shouldn't cause any actual problems.
    @pending_probes.delete(probe.id)

    payload = probe_notification_builder.build_installed(probe)
    probe_notifier_worker.add_status(payload)
  end

  # Installs pending line probes, if any, for the file of the specified
  # absolute path.
  #
  # This method is meant to be called from the script_compiled trace
  # point, which is invoked for each required or loaded file
  # (and also for eval'd code, but those invocations are filtered out).
  #
  # Errors raised by add_probe propagate to the caller.
  def install_pending_line_probes(path)
    @lock.synchronize do
      # Snapshot of the values: add_probe mutates the pending collection.
      @pending_probes.values.each do |probe|
        add_probe(probe) if probe.line? && probe.file_matches?(path)
      end
    end
  end

  # Entry point invoked from the instrumentation when the specified probe
  # is invoked (that is, either its target method is invoked, or
  # execution reached its target file/line).
  #
  # This method is responsible for queueing probe status to be sent to the
  # backend (once per the probe's lifetime) and a snapshot corresponding
  # to the current invocation.
  #
  # @param probe the probe that was executed
  # @param opts remaining keyword arguments, forwarded unchanged to
  #   +probe_notification_builder.build_executed+
  def probe_executed_callback(probe:, **opts)
    unless probe.emitting_notified?
      payload = probe_notification_builder.build_emitting(probe)
      probe_notifier_worker.add_status(payload)
      probe.emitting_notified = true
    end

    payload = probe_notification_builder.build_executed(probe, **opts)
    probe_notifier_worker.add_snapshot(payload)
  end

  # Class/module definition trace point (:end type).
  # Used to install hooks when the target classes/modules aren't yet
  # defined when the hook request is received.
  attr_reader :definition_trace_point
end
243
+ end
244
+ end
245
+
246
+ # rubocop:enable Lint/AssignmentInCondition
@@ -46,11 +46,13 @@ module Datadog
46
46
  # this should be all frames for enriched probes and no frames for
47
47
  # non-enriched probes?
48
48
  build_snapshot(probe, rv: rv, snapshot: snapshot,
49
+ # Actual path of the instrumented file.
50
+ path: trace_point&.path,
49
51
  duration: duration, caller_locations: caller_locations, args: args, kwargs: kwargs,
50
52
  serialized_entry_args: serialized_entry_args)
51
53
  end
52
54
 
53
- def build_snapshot(probe, rv: nil, snapshot: nil,
55
+ def build_snapshot(probe, rv: nil, snapshot: nil, path: nil,
54
56
  duration: nil, caller_locations: nil, args: nil, kwargs: nil,
55
57
  serialized_entry_args: nil)
56
58
  # TODO also verify that non-capturing probe does not pass
@@ -85,18 +87,8 @@ module Datadog
85
87
  end
86
88
 
87
89
  location = if probe.line?
88
- actual_file = if probe.file
89
- # Normally caller_locations should always be filled for a line probe
90
- # but in the test suite we don't always provide all arguments.
91
- actual_file_basename = File.basename(probe.file)
92
- caller_locations&.detect do |loc|
93
- # TODO record actual path that probe was installed into,
94
- # perform exact match here against that path.
95
- File.basename(loc.path) == actual_file_basename
96
- end&.path || probe.file
97
- end
98
90
  {
99
- file: actual_file,
91
+ file: path,
100
92
  lines: [probe.line_no],
101
93
  }
102
94
  elsif probe.method?
@@ -23,12 +23,9 @@ module Datadog
23
23
  #
24
24
  # @api private
25
25
  class ProbeNotifierWorker
26
- # Minimum interval between submissions.
27
- # TODO make this into an internal setting and increase default to 2 or 3.
28
- MIN_SEND_INTERVAL = 1
29
-
30
- def initialize(settings, transport, logger)
26
+ def initialize(settings, transport, logger, telemetry: nil)
31
27
  @settings = settings
28
+ @telemetry = telemetry
32
29
  @status_queue = []
33
30
  @snapshot_queue = []
34
31
  @transport = transport
@@ -39,10 +36,12 @@ module Datadog
39
36
  @sleep_remaining = nil
40
37
  @wake_scheduled = false
41
38
  @thread = nil
39
+ @flush = 0
42
40
  end
43
41
 
44
42
  attr_reader :settings
45
43
  attr_reader :logger
44
+ attr_reader :telemetry
46
45
 
47
46
  def start
48
47
  return if @thread
@@ -53,33 +52,38 @@ module Datadog
53
52
  # and then quit?
54
53
  break if @stop_requested
55
54
 
56
- sleep_remaining = @lock.synchronize do
57
- if sleep_remaining && sleep_remaining > 0
58
- # Recalculate how much sleep time is remaining, then sleep that long.
59
- set_sleep_remaining
60
- else
61
- 0
55
+ # If a flush was requested, send immediately and do not
56
+ # wait for the cooldown period.
57
+ if @lock.synchronize { @flush } == 0
58
+ sleep_remaining = @lock.synchronize do
59
+ if sleep_remaining && sleep_remaining > 0
60
+ # Recalculate how much sleep time is remaining, then sleep that long.
61
+ set_sleep_remaining
62
+ else
63
+ 0
64
+ end
62
65
  end
63
- end
64
66
 
65
- if sleep_remaining > 0
66
- # Do not need to update @wake_scheduled here because
67
- # wake-up is already scheduled for the earliest possible time.
68
- wake.wait(sleep_remaining)
69
- next
67
+ if sleep_remaining > 0
68
+ # Do not need to update @wake_scheduled here because
69
+ # wake-up is already scheduled for the earliest possible time.
70
+ wake.wait(sleep_remaining)
71
+ next
72
+ end
70
73
  end
71
74
 
72
75
  begin
73
76
  more = maybe_send
74
77
  rescue => exc
75
- raise if settings.dynamic_instrumentation.propagate_all_exceptions
78
+ raise if settings.dynamic_instrumentation.internal.propagate_all_exceptions
76
79
 
77
80
  logger.warn("Error in probe notifier worker: #{exc.class}: #{exc} (at #{exc.backtrace.first})")
81
+ telemetry&.report(exc, description: "Error in probe notifier worker")
78
82
  end
79
83
  @lock.synchronize do
80
84
  @wake_scheduled = more
81
85
  end
82
- wake.wait(more ? MIN_SEND_INTERVAL : nil)
86
+ wake.wait(more ? min_send_interval : nil)
83
87
  end
84
88
  end
85
89
  end
@@ -106,26 +110,40 @@ module Datadog
106
110
  # therefore, it should only be called when there is no parallel
107
111
  # activity (in another thread) that causes more notifications
108
112
  # to be generated.
113
+ #
114
+ # This method is used by the test suite to wait until notifications have
115
+ # been sent out, and could be used for graceful stopping of the
116
+ # worker thread.
109
117
  def flush
110
- loop do
111
- if @thread.nil? || !@thread.alive?
112
- return
113
- end
118
+ @lock.synchronize do
119
+ @flush += 1
120
+ end
121
+ begin
122
+ loop do
123
+ if @thread.nil? || !@thread.alive?
124
+ return
125
+ end
114
126
 
115
- io_in_progress, queues_empty = @lock.synchronize do
116
- [io_in_progress?, status_queue.empty? && snapshot_queue.empty?]
117
- end
127
+ io_in_progress, queues_empty = @lock.synchronize do
128
+ [io_in_progress?, status_queue.empty? && snapshot_queue.empty?]
129
+ end
118
130
 
119
- if io_in_progress
120
- # If we just call Thread.pass we could be in a busy loop -
121
- # add a sleep.
122
- sleep 0.25
123
- next
124
- elsif queues_empty
125
- break
126
- else
127
- sleep 0.25
128
- next
131
+ if io_in_progress
132
+ # If we just call Thread.pass we could be in a busy loop -
133
+ # add a sleep.
134
+ sleep 0.25
135
+ next
136
+ elsif queues_empty
137
+ break
138
+ else
139
+ wake.signal
140
+ sleep 0.25
141
+ next
142
+ end
143
+ end
144
+ ensure
145
+ @lock.synchronize do
146
+ @flush -= 1
129
147
  end
130
148
  end
131
149
  end
@@ -136,6 +154,11 @@ module Datadog
136
154
  attr_reader :wake
137
155
  attr_reader :thread
138
156
 
157
+ # Convenience method to keep line length reasonable in the rest of the file.
158
def min_send_interval
  # Read the value from settings on every call rather than caching it.
  internal_settings = settings.dynamic_instrumentation.internal
  internal_settings.min_send_interval
end
161
+
139
162
  # This method should be called while @lock is held.
140
163
  def io_in_progress?
141
164
  @io_in_progress
@@ -181,14 +204,14 @@ module Datadog
181
204
  end
182
205
 
183
206
  # Determine how much longer the worker thread should sleep
184
- # so as not to send in less than MIN_SEND_INTERVAL since the last send.
207
+ # so as not to send in less than min send interval since the last send.
185
208
  # Important: this method must be called when @lock is held.
186
209
  #
187
210
  # Returns the time remaining to sleep.
188
211
  def set_sleep_remaining
189
212
  now = Core::Utils::Time.get_time
190
213
  @sleep_remaining = if last_sent
191
- [last_sent + MIN_SEND_INTERVAL - now, 0].max
214
+ [last_sent + min_send_interval - now, 0].max
192
215
  else
193
216
  0
194
217
  end
@@ -218,16 +241,20 @@ module Datadog
218
241
  @last_sent = time
219
242
  end
220
243
  rescue => exc
221
- raise if settings.dynamic_instrumentation.propagate_all_exceptions
244
+ raise if settings.dynamic_instrumentation.internal.propagate_all_exceptions
222
245
  logger.warn("failed to send #{event_name}: #{exc.class}: #{exc} (at #{exc.backtrace.first})")
246
+ # Should we report this error to telemetry? Most likely failure
247
+ # to send is due to a network issue, and trying to send a
248
+ # telemetry message would also fail.
223
249
  end
224
250
  end
225
251
  batch.any? # steep:ignore
226
- rescue ThreadError
252
+ rescue ThreadError => exc
227
253
  # Normally the queue should only be consumed in this method,
228
254
  # however if anyone consumes it elsewhere we don't want to block
229
255
  # while consuming it here. Rescue ThreadError and return.
230
- logger.warn("unexpected #{event_name} queue underflow - consumed elsewhere?")
256
+ logger.warn("Unexpected #{event_name} queue underflow - consumed elsewhere?")
257
+ telemetry&.report(exc, description: "Unexpected #{event_name} queue underflow")
231
258
  ensure
232
259
  @lock.synchronize do
233
260
  @io_in_progress = false