datadog 2.6.0 → 2.7.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -54,10 +54,11 @@ module Datadog
54
54
  #
55
55
  # @api private
56
56
  class Instrumenter
57
- def initialize(settings, serializer, logger, code_tracker: nil)
57
+ def initialize(settings, serializer, logger, code_tracker: nil, telemetry: nil)
58
58
  @settings = settings
59
59
  @serializer = serializer
60
60
  @logger = logger
61
+ @telemetry = telemetry
61
62
  @code_tracker = code_tracker
62
63
 
63
64
  @lock = Mutex.new
@@ -66,6 +67,7 @@ module Datadog
66
67
  attr_reader :settings
67
68
  attr_reader :serializer
68
69
  attr_reader :logger
70
+ attr_reader :telemetry
69
71
  attr_reader :code_tracker
70
72
 
71
73
  # This is a substitute for Thread::Backtrace::Location
@@ -172,12 +174,12 @@ module Datadog
172
174
  # we use mock objects and the methods may be mocked with
173
175
  # individual invocations, yielding different return values on
174
176
  # different calls to the same method.
175
- permit_untargeted_trace_points = settings.dynamic_instrumentation.untargeted_trace_points
177
+ permit_untargeted_trace_points = settings.dynamic_instrumentation.internal.untargeted_trace_points
176
178
 
177
179
  iseq = nil
178
180
  if code_tracker
179
- iseq = code_tracker.iseqs_for_path_suffix(probe.file).first # steep:ignore
180
- unless iseq
181
+ ret = code_tracker.iseqs_for_path_suffix(probe.file) # steep:ignore
182
+ unless ret
181
183
  if permit_untargeted_trace_points
182
184
  # Continue without targeting the trace point.
183
185
  # This is going to cause a serious performance penalty for
@@ -204,6 +206,10 @@ module Datadog
204
206
  raise Error::DITargetNotDefined, "File not in code tracker registry: #{probe.file}"
205
207
  end
206
208
 
209
+ if ret
210
+ actual_path, iseq = ret
211
+ end
212
+
207
213
  # If trace point is not targeted, we only need one trace point per file.
208
214
  # Creating a trace point for each probe does work but the performance
209
215
  # penalty will be taken for each trace point defined in the file.
@@ -217,18 +223,26 @@ module Datadog
217
223
  # this optimization just yet and create a trace point for each probe.
218
224
 
219
225
  tp = TracePoint.new(:line) do |tp|
220
- # If trace point is not targeted, we must verify that the invocation
221
- # is the file & line that we want, because untargeted trace points
222
- # are invoked for *each* line of Ruby executed.
223
- if iseq || tp.lineno == probe.line_no && probe.file_matches?(tp.path)
224
- if rate_limiter.nil? || rate_limiter.allow?
225
- # & is to stop steep complaints, block is always present here.
226
- block&.call(probe: probe, trace_point: tp, caller_locations: caller_locations)
226
+ begin
227
+ # If trace point is not targeted, we must verify that the invocation
228
+ # is the file & line that we want, because untargeted trace points
229
+ # are invoked for *each* line of Ruby executed.
230
+ if iseq || tp.lineno == probe.line_no && probe.file_matches?(tp.path)
231
+ if rate_limiter.nil? || rate_limiter.allow?
232
+ # & is to stop steep complaints, block is always present here.
233
+ block&.call(probe: probe, trace_point: tp, caller_locations: caller_locations)
234
+ end
227
235
  end
236
+ rescue => exc
237
+ raise if settings.dynamic_instrumentation.internal.propagate_all_exceptions
238
+ logger.warn("Unhandled exception in line trace point: #{exc.class}: #{exc}")
239
+ telemetry&.report(exc, description: "Unhandled exception in line trace point")
240
+ # TODO test this path
228
241
  end
229
242
  rescue => exc
230
243
  raise if settings.dynamic_instrumentation.propagate_all_exceptions
231
244
  logger.warn("Unhandled exception in line trace point: #{exc.class}: #{exc}")
245
+ telemetry&.report(exc, description: "Unhandled exception in line trace point")
232
246
  # TODO test this path
233
247
  end
234
248
 
@@ -244,6 +258,8 @@ module Datadog
244
258
  end
245
259
 
246
260
  probe.instrumentation_trace_point = tp
261
+ # actual_path could be nil if we don't use targeted trace points.
262
+ probe.instrumented_path = actual_path
247
263
 
248
264
  if iseq
249
265
  tp.enable(target: iseq, target_line: line_no)
@@ -47,6 +47,10 @@ module Datadog
47
47
  raise ArgumentError, "Probe contains both line number and method name: #{id}"
48
48
  end
49
49
 
50
+ if line_no && !file
51
+ raise ArgumentError, "Probe contains line number but not file: #{id}"
52
+ end
53
+
50
54
  if type_name && !method_name || method_name && !type_name
51
55
  raise ArgumentError, "Partial method probe definition: #{id}"
52
56
  end
@@ -71,6 +75,8 @@ module Datadog
71
75
 
72
76
  @rate_limit = rate_limit || (@capture_snapshot ? 1 : 5000)
73
77
  @rate_limiter = Datadog::Core::TokenBucket.new(@rate_limit)
78
+
79
+ @emitting_notified = false
74
80
  end
75
81
 
76
82
  attr_reader :id
@@ -101,7 +107,10 @@ module Datadog
101
107
  # method or for stack traversal purposes?), therefore we do not check
102
108
  # for file name/path presence here and just consider the line number.
103
109
  def line?
104
- !line_no.nil?
110
+ # Constructor checks that file is given if line number is given,
111
+ # but for safety, check again here since we somehow got a probe with
112
+ # a line number but no file in the wild.
113
+ !!(file && line_no)
105
114
  end
106
115
 
107
116
  # Returns whether the probe is a method probe.
@@ -157,6 +166,19 @@ module Datadog
157
166
  # Line trace point for line probes. Normally this would be a targeted
158
167
  # trace point.
159
168
  attr_accessor :instrumentation_trace_point
169
+
170
+ # Actual path to the file instrumented by the probe, for line probes,
171
+ # when code tracking is available and line trace point is targeted.
172
+ # For untargeted line trace points instrumented path will be nil.
173
+ attr_accessor :instrumented_path
174
+
175
+ # TODO emitting_notified reads and writes should in theory be locked,
176
+ # however since DI is only implemented for MRI in practice the missing
177
+ # locking should not cause issues.
178
+ attr_writer :emitting_notified
179
+ def emitting_notified?
180
+ !!@emitting_notified
181
+ end
160
182
  end
161
183
  end
162
184
  end
@@ -0,0 +1,246 @@
1
+ # frozen_string_literal: true
2
+
3
+ # rubocop:disable Lint/AssignmentInCondition
4
+
5
+ require 'monitor'
6
+
7
+ module Datadog
8
+ module DI
9
+ # Stores probes received from remote config (that we can parse, in other
10
+ # words, whose type/attributes we support), requests needed instrumentation
11
+ # for the probes via Instrumenter, and stores pending probes (those which
12
+ # haven't yet been instrumented successfully due to their targets not
13
+ # existing) and failed probes (where we are certain the target will not
14
+ # ever be loaded, or otherwise become valid).
15
+ #
16
+ # @api private
17
+ class ProbeManager
18
+ def initialize(settings, instrumenter, probe_notification_builder,
19
+ probe_notifier_worker, logger, telemetry: nil)
20
+ @settings = settings
21
+ @instrumenter = instrumenter
22
+ @probe_notification_builder = probe_notification_builder
23
+ @probe_notifier_worker = probe_notifier_worker
24
+ @logger = logger
25
+ @telemetry = telemetry
26
+ @installed_probes = {}
27
+ @pending_probes = {}
28
+ @failed_probes = {}
29
+ @lock = Monitor.new
30
+
31
+ @definition_trace_point = TracePoint.trace(:end) do |tp|
32
+ install_pending_method_probes(tp.self)
33
+ rescue => exc
34
+ raise if settings.dynamic_instrumentation.internal.propagate_all_exceptions
35
+ logger.warn("Unhandled exception in definition trace point: #{exc.class}: #{exc}")
36
+ telemetry&.report(exc, description: "Unhandled exception in definition trace point")
37
+ # TODO test this path
38
+ end
39
+ end
40
+
41
+ attr_reader :logger
42
+ attr_reader :telemetry
43
+
44
+ # TODO test that close is called during component teardown and
45
+ # the trace point is cleared
46
+ def close
47
+ definition_trace_point.disable
48
+ clear_hooks
49
+ end
50
+
51
+ def clear_hooks
52
+ @lock.synchronize do
53
+ @pending_probes.clear
54
+ @installed_probes.each do |probe_id, probe|
55
+ instrumenter.unhook(probe)
56
+ end
57
+ @installed_probes.clear
58
+ end
59
+ end
60
+
61
+ attr_reader :settings
62
+ attr_reader :instrumenter
63
+ attr_reader :probe_notification_builder
64
+ attr_reader :probe_notifier_worker
65
+
66
+ def installed_probes
67
+ @lock.synchronize do
68
+ @installed_probes
69
+ end
70
+ end
71
+
72
+ def pending_probes
73
+ @lock.synchronize do
74
+ @pending_probes
75
+ end
76
+ end
77
+
78
+ # Probes that failed to instrument for reasons other than the target
79
+ # not yet being loaded are added to this collection, so that we do not try
80
+ # to instrument them every time remote configuration is processed.
81
+ def failed_probes
82
+ @lock.synchronize do
83
+ @failed_probes
84
+ end
85
+ end
86
+
87
+ # Requests to install the specified probe.
88
+ #
89
+ # If the target of the probe does not exist, assume the relevant
90
+ # code is not loaded yet (rather than that it will never be loaded),
91
+ # and store the probe in a pending probe list. When classes are
92
+ # defined, or files loaded, the probe will be checked against the
93
+ # newly defined classes/loaded files, and will be installed if it
94
+ # matches.
95
+ def add_probe(probe)
96
+ @lock.synchronize do
97
+ # Probe failed to install previously, do not try to install it again.
98
+ if msg = @failed_probes[probe.id]
99
+ # TODO test this path
100
+ raise Error::ProbePreviouslyFailed, msg
101
+ end
102
+
103
+ begin
104
+ instrumenter.hook(probe, &method(:probe_executed_callback))
105
+
106
+ @installed_probes[probe.id] = probe
107
+ payload = probe_notification_builder.build_installed(probe)
108
+ probe_notifier_worker.add_status(payload)
109
+ # The probe would only be in the pending probes list if it was
110
+ # previously attempted to be installed and the target was not loaded.
111
+ # Always remove from pending list here because it makes the
112
+ # API smaller and shouldn't cause any actual problems.
113
+ @pending_probes.delete(probe.id)
114
+ true
115
+ rescue Error::DITargetNotDefined
116
+ @pending_probes[probe.id] = probe
117
+ false
118
+ end
119
+ rescue => exc
120
+ # In "propagate all exceptions" mode we will try to instrument again.
121
+ raise if settings.dynamic_instrumentation.internal.propagate_all_exceptions
122
+
123
+ logger.warn("Error processing probe configuration: #{exc.class}: #{exc}")
124
+ telemetry&.report(exc, description: "Error processing probe configuration")
125
+ # TODO report probe as failed to agent since we won't attempt to
126
+ # install it again.
127
+
128
+ # TODO add top stack frame to message
129
+ @failed_probes[probe.id] = "#{exc.class}: #{exc}"
130
+
131
+ raise
132
+ end
133
+ end
134
+
135
+ # Removes probes with ids other than in the specified list.
136
+ #
137
+ # This method is meant to be invoked from remote config processor.
138
+ # Remote config contains the list of currently defined probes; any
139
+ # probes not in that list have been removed by user and should be
140
+ # de-instrumented from the application.
141
+ def remove_other_probes(probe_ids)
142
+ @lock.synchronize do
143
+ @pending_probes.values.each do |probe|
144
+ unless probe_ids.include?(probe.id)
145
+ @pending_probes.delete(probe.id)
146
+ end
147
+ end
148
+ @installed_probes.values.each do |probe|
149
+ unless probe_ids.include?(probe.id)
150
+ begin
151
+ instrumenter.unhook(probe)
152
+ # Only remove the probe from installed list if it was
153
+ # successfully de-instrumented. Active probes do incur overhead
154
+ # for the running application, and if the error is ephemeral
155
+ # we want to try removing the probe again at the next opportunity.
156
+ #
157
+ # TODO give up after some time?
158
+ @installed_probes.delete(probe.id)
159
+ rescue => exc
160
+ raise if settings.dynamic_instrumentation.internal.propagate_all_exceptions
161
+ # Silence all exceptions?
162
+ # TODO should we propagate here and rescue upstream?
163
+ logger.warn("Error removing probe #{probe.id}: #{exc.class}: #{exc}")
164
+ telemetry&.report(exc, description: "Error removing probe #{probe.id}")
165
+ end
166
+ end
167
+ end
168
+ end
169
+ end
170
+
171
+ # Installs pending method probes, if any, for the specified class.
172
+ #
173
+ # This method is meant to be called from the "end" trace point,
174
+ # which is invoked for each class definition.
175
+ private def install_pending_method_probes(cls)
176
+ @lock.synchronize do
177
+ # TODO search more efficiently than linearly
178
+ @pending_probes.each do |probe_id, probe|
179
+ if probe.method?
180
+ # TODO move this stringification elsewhere
181
+ if probe.type_name == cls.name
182
+ begin
183
+ # TODO is it OK to hook from trace point handler?
184
+ # TODO the class is now defined, but can hooking still fail?
185
+ instrumenter.hook(probe, &method(:probe_executed_callback))
186
+ @pending_probes.delete(probe.id)
187
+ break
188
+ rescue Error::DITargetNotDefined
189
+ # This should not happen... try installing again later?
190
+ rescue => exc
191
+ raise if settings.dynamic_instrumentation.internal.propagate_all_exceptions
192
+
193
+ logger.warn("Error installing probe after class is defined: #{exc.class}: #{exc}")
194
+ telemetry&.report(exc, description: "Error installing probe after class is defined")
195
+ end
196
+ end
197
+ end
198
+ end
199
+ end
200
+ end
201
+
202
+ # Installs pending line probes, if any, for the file of the specified
203
+ # absolute path.
204
+ #
205
+ # This method is meant to be called from the script_compiled trace
206
+ # point, which is invoked for each required or loaded file
207
+ # (and also for eval'd code, but those invocations are filtered out).
208
+ def install_pending_line_probes(path)
209
+ @lock.synchronize do
210
+ @pending_probes.values.each do |probe|
211
+ if probe.line?
212
+ if probe.file_matches?(path)
213
+ add_probe(probe)
214
+ end
215
+ end
216
+ end
217
+ end
218
+ end
219
+
220
+ # Entry point invoked from the instrumentation when the specified probe
221
+ # is invoked (that is, either its target method is invoked, or
222
+ # execution reached its target file/line).
223
+ #
224
+ # This method is responsible for queueing probe status to be sent to the
225
+ # backend (once per the probe's lifetime) and a snapshot corresponding
226
+ # to the current invocation.
227
+ def probe_executed_callback(probe:, **opts)
228
+ unless probe.emitting_notified?
229
+ payload = probe_notification_builder.build_emitting(probe)
230
+ probe_notifier_worker.add_status(payload)
231
+ probe.emitting_notified = true
232
+ end
233
+
234
+ payload = probe_notification_builder.build_executed(probe, **opts)
235
+ probe_notifier_worker.add_snapshot(payload)
236
+ end
237
+
238
+ # Class/module definition trace point (:end type).
239
+ # Used to install hooks when the target classes/modules aren't yet
240
+ # defined when the hook request is received.
241
+ attr_reader :definition_trace_point
242
+ end
243
+ end
244
+ end
245
+
246
+ # rubocop:enable Lint/AssignmentInCondition
@@ -46,11 +46,13 @@ module Datadog
46
46
  # this should be all frames for enriched probes and no frames for
47
47
  # non-enriched probes?
48
48
  build_snapshot(probe, rv: rv, snapshot: snapshot,
49
+ # Actual path of the instrumented file.
50
+ path: trace_point&.path,
49
51
  duration: duration, caller_locations: caller_locations, args: args, kwargs: kwargs,
50
52
  serialized_entry_args: serialized_entry_args)
51
53
  end
52
54
 
53
- def build_snapshot(probe, rv: nil, snapshot: nil,
55
+ def build_snapshot(probe, rv: nil, snapshot: nil, path: nil,
54
56
  duration: nil, caller_locations: nil, args: nil, kwargs: nil,
55
57
  serialized_entry_args: nil)
56
58
  # TODO also verify that non-capturing probe does not pass
@@ -85,18 +87,8 @@ module Datadog
85
87
  end
86
88
 
87
89
  location = if probe.line?
88
- actual_file = if probe.file
89
- # Normally caller_locations should always be filled for a line probe
90
- # but in the test suite we don't always provide all arguments.
91
- actual_file_basename = File.basename(probe.file)
92
- caller_locations&.detect do |loc|
93
- # TODO record actual path that probe was installed into,
94
- # perform exact match here against that path.
95
- File.basename(loc.path) == actual_file_basename
96
- end&.path || probe.file
97
- end
98
90
  {
99
- file: actual_file,
91
+ file: path,
100
92
  lines: [probe.line_no],
101
93
  }
102
94
  elsif probe.method?
@@ -23,12 +23,9 @@ module Datadog
23
23
  #
24
24
  # @api private
25
25
  class ProbeNotifierWorker
26
- # Minimum interval between submissions.
27
- # TODO make this into an internal setting and increase default to 2 or 3.
28
- MIN_SEND_INTERVAL = 1
29
-
30
- def initialize(settings, transport, logger)
26
+ def initialize(settings, transport, logger, telemetry: nil)
31
27
  @settings = settings
28
+ @telemetry = telemetry
32
29
  @status_queue = []
33
30
  @snapshot_queue = []
34
31
  @transport = transport
@@ -39,10 +36,12 @@ module Datadog
39
36
  @sleep_remaining = nil
40
37
  @wake_scheduled = false
41
38
  @thread = nil
39
+ @flush = 0
42
40
  end
43
41
 
44
42
  attr_reader :settings
45
43
  attr_reader :logger
44
+ attr_reader :telemetry
46
45
 
47
46
  def start
48
47
  return if @thread
@@ -53,33 +52,38 @@ module Datadog
53
52
  # and then quit?
54
53
  break if @stop_requested
55
54
 
56
- sleep_remaining = @lock.synchronize do
57
- if sleep_remaining && sleep_remaining > 0
58
- # Recalculate how much sleep time is remaining, then sleep that long.
59
- set_sleep_remaining
60
- else
61
- 0
55
+ # If a flush was requested, send immediately and do not
56
+ # wait for the cooldown period.
57
+ if @lock.synchronize { @flush } == 0
58
+ sleep_remaining = @lock.synchronize do
59
+ if sleep_remaining && sleep_remaining > 0
60
+ # Recalculate how much sleep time is remaining, then sleep that long.
61
+ set_sleep_remaining
62
+ else
63
+ 0
64
+ end
62
65
  end
63
- end
64
66
 
65
- if sleep_remaining > 0
66
- # Do not need to update @wake_scheduled here because
67
- # wake-up is already scheduled for the earliest possible time.
68
- wake.wait(sleep_remaining)
69
- next
67
+ if sleep_remaining > 0
68
+ # Do not need to update @wake_scheduled here because
69
+ # wake-up is already scheduled for the earliest possible time.
70
+ wake.wait(sleep_remaining)
71
+ next
72
+ end
70
73
  end
71
74
 
72
75
  begin
73
76
  more = maybe_send
74
77
  rescue => exc
75
- raise if settings.dynamic_instrumentation.propagate_all_exceptions
78
+ raise if settings.dynamic_instrumentation.internal.propagate_all_exceptions
76
79
 
77
80
  logger.warn("Error in probe notifier worker: #{exc.class}: #{exc} (at #{exc.backtrace.first})")
81
+ telemetry&.report(exc, description: "Error in probe notifier worker")
78
82
  end
79
83
  @lock.synchronize do
80
84
  @wake_scheduled = more
81
85
  end
82
- wake.wait(more ? MIN_SEND_INTERVAL : nil)
86
+ wake.wait(more ? min_send_interval : nil)
83
87
  end
84
88
  end
85
89
  end
@@ -106,26 +110,40 @@ module Datadog
106
110
  # therefore, it should only be called when there is no parallel
107
111
  # activity (in another thread) that causes more notifications
108
112
  # to be generated.
113
+ #
114
+ # This method is used by the test suite to wait until notifications have
115
+ # been sent out, and could be used for graceful stopping of the
116
+ # worker thread.
109
117
  def flush
110
- loop do
111
- if @thread.nil? || !@thread.alive?
112
- return
113
- end
118
+ @lock.synchronize do
119
+ @flush += 1
120
+ end
121
+ begin
122
+ loop do
123
+ if @thread.nil? || !@thread.alive?
124
+ return
125
+ end
114
126
 
115
- io_in_progress, queues_empty = @lock.synchronize do
116
- [io_in_progress?, status_queue.empty? && snapshot_queue.empty?]
117
- end
127
+ io_in_progress, queues_empty = @lock.synchronize do
128
+ [io_in_progress?, status_queue.empty? && snapshot_queue.empty?]
129
+ end
118
130
 
119
- if io_in_progress
120
- # If we just call Thread.pass we could be in a busy loop -
121
- # add a sleep.
122
- sleep 0.25
123
- next
124
- elsif queues_empty
125
- break
126
- else
127
- sleep 0.25
128
- next
131
+ if io_in_progress
132
+ # If we just call Thread.pass we could be in a busy loop -
133
+ # add a sleep.
134
+ sleep 0.25
135
+ next
136
+ elsif queues_empty
137
+ break
138
+ else
139
+ wake.signal
140
+ sleep 0.25
141
+ next
142
+ end
143
+ end
144
+ ensure
145
+ @lock.synchronize do
146
+ @flush -= 1
129
147
  end
130
148
  end
131
149
  end
@@ -136,6 +154,11 @@ module Datadog
136
154
  attr_reader :wake
137
155
  attr_reader :thread
138
156
 
157
+ # Convenience method to keep line length reasonable in the rest of the file.
158
+ def min_send_interval
159
+ settings.dynamic_instrumentation.internal.min_send_interval
160
+ end
161
+
139
162
  # This method should be called while @lock is held.
140
163
  def io_in_progress?
141
164
  @io_in_progress
@@ -181,14 +204,14 @@ module Datadog
181
204
  end
182
205
 
183
206
  # Determine how much longer the worker thread should sleep
184
- # so as not to send in less than MIN_SEND_INTERVAL since the last send.
207
+ # so as not to send in less than min send interval since the last send.
185
208
  # Important: this method must be called when @lock is held.
186
209
  #
187
210
  # Returns the time remaining to sleep.
188
211
  def set_sleep_remaining
189
212
  now = Core::Utils::Time.get_time
190
213
  @sleep_remaining = if last_sent
191
- [last_sent + MIN_SEND_INTERVAL - now, 0].max
214
+ [last_sent + min_send_interval - now, 0].max
192
215
  else
193
216
  0
194
217
  end
@@ -218,16 +241,20 @@ module Datadog
218
241
  @last_sent = time
219
242
  end
220
243
  rescue => exc
221
- raise if settings.dynamic_instrumentation.propagate_all_exceptions
244
+ raise if settings.dynamic_instrumentation.internal.propagate_all_exceptions
222
245
  logger.warn("failed to send #{event_name}: #{exc.class}: #{exc} (at #{exc.backtrace.first})")
246
+ # Should we report this error to telemetry? Most likely failure
247
+ # to send is due to a network issue, and trying to send a
248
+ # telemetry message would also fail.
223
249
  end
224
250
  end
225
251
  batch.any? # steep:ignore
226
- rescue ThreadError
252
+ rescue ThreadError => exc
227
253
  # Normally the queue should only be consumed in this method,
228
254
  # however if anyone consumes it elsewhere we don't want to block
229
255
  # while consuming it here. Rescue ThreadError and return.
230
- logger.warn("unexpected #{event_name} queue underflow - consumed elsewhere?")
256
+ logger.warn("Unexpected #{event_name} queue underflow - consumed elsewhere?")
257
+ telemetry&.report(exc, description: "Unexpected #{event_name} queue underflow")
231
258
  ensure
232
259
  @lock.synchronize do
233
260
  @io_in_progress = false