ddtrace 1.5.2 → 1.6.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +44 -1
- data/ext/ddtrace_profiling_loader/ddtrace_profiling_loader.c +9 -2
- data/ext/ddtrace_profiling_loader/extconf.rb +17 -0
- data/ext/ddtrace_profiling_native_extension/NativeExtensionDesign.md +38 -2
- data/ext/ddtrace_profiling_native_extension/clock_id.h +1 -0
- data/ext/ddtrace_profiling_native_extension/clock_id_from_pthread.c +1 -0
- data/ext/ddtrace_profiling_native_extension/collectors_cpu_and_wall_time.c +517 -42
- data/ext/ddtrace_profiling_native_extension/collectors_cpu_and_wall_time.h +3 -0
- data/ext/ddtrace_profiling_native_extension/collectors_cpu_and_wall_time_worker.c +208 -30
- data/ext/ddtrace_profiling_native_extension/collectors_stack.c +156 -46
- data/ext/ddtrace_profiling_native_extension/collectors_stack.h +11 -2
- data/ext/ddtrace_profiling_native_extension/extconf.rb +11 -1
- data/ext/ddtrace_profiling_native_extension/http_transport.c +83 -64
- data/ext/ddtrace_profiling_native_extension/libdatadog_helpers.h +4 -4
- data/ext/ddtrace_profiling_native_extension/native_extension_helpers.rb +3 -2
- data/ext/ddtrace_profiling_native_extension/private_vm_api_access.c +59 -0
- data/ext/ddtrace_profiling_native_extension/private_vm_api_access.h +3 -0
- data/ext/ddtrace_profiling_native_extension/profiling.c +10 -0
- data/ext/ddtrace_profiling_native_extension/ruby_helpers.c +0 -1
- data/ext/ddtrace_profiling_native_extension/ruby_helpers.h +4 -2
- data/ext/ddtrace_profiling_native_extension/stack_recorder.c +45 -29
- data/ext/ddtrace_profiling_native_extension/stack_recorder.h +7 -7
- data/lib/datadog/appsec/contrib/rack/request_middleware.rb +4 -0
- data/lib/datadog/appsec/event.rb +6 -0
- data/lib/datadog/core/configuration/components.rb +20 -14
- data/lib/datadog/core/configuration/settings.rb +42 -4
- data/lib/datadog/core/diagnostics/environment_logger.rb +5 -1
- data/lib/datadog/core/utils/compression.rb +5 -1
- data/lib/datadog/core.rb +0 -54
- data/lib/datadog/profiling/collectors/cpu_and_wall_time.rb +12 -2
- data/lib/datadog/profiling/collectors/cpu_and_wall_time_worker.rb +5 -3
- data/lib/datadog/profiling/exporter.rb +2 -4
- data/lib/datadog/profiling/http_transport.rb +1 -1
- data/lib/datadog/tracing/configuration/ext.rb +1 -0
- data/lib/datadog/tracing/contrib/aws/instrumentation.rb +2 -0
- data/lib/datadog/tracing/contrib/dalli/ext.rb +1 -0
- data/lib/datadog/tracing/contrib/dalli/instrumentation.rb +4 -0
- data/lib/datadog/tracing/contrib/elasticsearch/ext.rb +2 -0
- data/lib/datadog/tracing/contrib/elasticsearch/patcher.rb +3 -0
- data/lib/datadog/tracing/contrib/ethon/easy_patch.rb +2 -0
- data/lib/datadog/tracing/contrib/ethon/multi_patch.rb +2 -0
- data/lib/datadog/tracing/contrib/excon/middleware.rb +2 -0
- data/lib/datadog/tracing/contrib/ext.rb +6 -0
- data/lib/datadog/tracing/contrib/faraday/middleware.rb +2 -0
- data/lib/datadog/tracing/contrib/grpc/datadog_interceptor/client.rb +5 -0
- data/lib/datadog/tracing/contrib/grpc/datadog_interceptor/server.rb +7 -1
- data/lib/datadog/tracing/contrib/grpc/ext.rb +2 -0
- data/lib/datadog/tracing/contrib/hanami/action_tracer.rb +47 -0
- data/lib/datadog/tracing/contrib/hanami/configuration/settings.rb +22 -0
- data/lib/datadog/tracing/contrib/hanami/ext.rb +24 -0
- data/lib/datadog/tracing/contrib/hanami/integration.rb +44 -0
- data/lib/datadog/tracing/contrib/hanami/patcher.rb +33 -0
- data/lib/datadog/tracing/contrib/hanami/plugin.rb +23 -0
- data/lib/datadog/tracing/contrib/hanami/renderer_policy_tracing.rb +41 -0
- data/lib/datadog/tracing/contrib/hanami/router_tracing.rb +44 -0
- data/lib/datadog/tracing/contrib/http/instrumentation.rb +2 -0
- data/lib/datadog/tracing/contrib/httpclient/instrumentation.rb +2 -0
- data/lib/datadog/tracing/contrib/httprb/instrumentation.rb +2 -0
- data/lib/datadog/tracing/contrib/mongodb/ext.rb +7 -0
- data/lib/datadog/tracing/contrib/mongodb/subscribers.rb +4 -0
- data/lib/datadog/tracing/contrib/mysql2/configuration/settings.rb +12 -0
- data/lib/datadog/tracing/contrib/mysql2/ext.rb +1 -0
- data/lib/datadog/tracing/contrib/mysql2/instrumentation.rb +16 -0
- data/lib/datadog/tracing/contrib/pg/configuration/settings.rb +12 -0
- data/lib/datadog/tracing/contrib/pg/ext.rb +2 -1
- data/lib/datadog/tracing/contrib/pg/instrumentation.rb +34 -18
- data/lib/datadog/tracing/contrib/propagation/sql_comment/comment.rb +43 -0
- data/lib/datadog/tracing/contrib/propagation/sql_comment/ext.rb +32 -0
- data/lib/datadog/tracing/contrib/propagation/sql_comment/mode.rb +28 -0
- data/lib/datadog/tracing/contrib/propagation/sql_comment.rb +49 -0
- data/lib/datadog/tracing/contrib/rack/middlewares.rb +11 -5
- data/lib/datadog/tracing/contrib/redis/ext.rb +2 -0
- data/lib/datadog/tracing/contrib/redis/instrumentation.rb +4 -2
- data/lib/datadog/tracing/contrib/redis/patcher.rb +41 -0
- data/lib/datadog/tracing/contrib/redis/tags.rb +5 -0
- data/lib/datadog/tracing/contrib/rest_client/request_patch.rb +2 -0
- data/lib/datadog/tracing/contrib/sinatra/env.rb +12 -23
- data/lib/datadog/tracing/contrib/sinatra/ext.rb +7 -3
- data/lib/datadog/tracing/contrib/sinatra/patcher.rb +2 -2
- data/lib/datadog/tracing/contrib/sinatra/tracer.rb +8 -80
- data/lib/datadog/tracing/contrib/sinatra/tracer_middleware.rb +14 -9
- data/lib/datadog/tracing/contrib.rb +1 -0
- data/lib/datadog/tracing/distributed/datadog_tags_codec.rb +84 -0
- data/lib/datadog/tracing/distributed/headers/datadog.rb +122 -30
- data/lib/datadog/tracing/distributed/headers/ext.rb +2 -0
- data/lib/datadog/tracing/flush.rb +1 -1
- data/lib/datadog/tracing/metadata/ext.rb +8 -0
- data/lib/datadog/tracing/propagation/http.rb +9 -1
- data/lib/datadog/tracing/sampling/ext.rb +31 -0
- data/lib/datadog/tracing/sampling/priority_sampler.rb +46 -4
- data/lib/datadog/tracing/sampling/rate_by_key_sampler.rb +8 -9
- data/lib/datadog/tracing/sampling/rate_by_service_sampler.rb +29 -5
- data/lib/datadog/tracing/sampling/rate_sampler.rb +10 -3
- data/lib/datadog/tracing/sampling/rule_sampler.rb +4 -3
- data/lib/datadog/tracing/sampling/span/ext.rb +0 -4
- data/lib/datadog/tracing/sampling/span/rule.rb +1 -1
- data/lib/datadog/tracing/sampling/span/sampler.rb +14 -3
- data/lib/datadog/tracing/trace_digest.rb +3 -0
- data/lib/datadog/tracing/trace_operation.rb +10 -0
- data/lib/datadog/tracing/trace_segment.rb +6 -0
- data/lib/datadog/tracing/tracer.rb +3 -1
- data/lib/datadog/tracing/writer.rb +7 -0
- data/lib/ddtrace/transport/trace_formatter.rb +7 -0
- data/lib/ddtrace/transport/traces.rb +1 -1
- data/lib/ddtrace/version.rb +2 -2
- metadata +18 -14
- data/lib/datadog/profiling/old_ext.rb +0 -42
- data/lib/datadog/profiling/transport/http/api/endpoint.rb +0 -85
- data/lib/datadog/profiling/transport/http/api/instance.rb +0 -38
- data/lib/datadog/profiling/transport/http/api/spec.rb +0 -42
- data/lib/datadog/profiling/transport/http/api.rb +0 -45
- data/lib/datadog/profiling/transport/http/builder.rb +0 -30
- data/lib/datadog/profiling/transport/http/client.rb +0 -37
- data/lib/datadog/profiling/transport/http/response.rb +0 -21
- data/lib/datadog/profiling/transport/http.rb +0 -118
@@ -3,4 +3,7 @@
|
|
3
3
|
#include <ruby.h>
|
4
4
|
|
5
5
|
VALUE cpu_and_wall_time_collector_sample(VALUE self_instance);
|
6
|
+
VALUE cpu_and_wall_time_collector_sample_after_gc(VALUE self_instance);
|
7
|
+
void cpu_and_wall_time_collector_on_gc_start(VALUE self_instance);
|
8
|
+
void cpu_and_wall_time_collector_on_gc_finish(VALUE self_instance);
|
6
9
|
VALUE enforce_cpu_and_wall_time_collector_instance(VALUE object);
|
@@ -64,18 +64,28 @@ struct cpu_and_wall_time_worker_state {
|
|
64
64
|
// telling the sampling trigger loop to stop, but if we ever need to communicate more, we should move to actual
|
65
65
|
// atomic operations. stdatomic.h seems a nice thing to reach out for.
|
66
66
|
volatile bool should_run;
|
67
|
-
|
67
|
+
bool gc_profiling_enabled;
|
68
68
|
VALUE cpu_and_wall_time_collector_instance;
|
69
|
+
|
69
70
|
// When something goes wrong during sampling, we record the Ruby exception here, so that it can be "re-raised" on
|
70
71
|
// the CpuAndWallTimeWorker thread
|
71
72
|
VALUE failure_exception;
|
73
|
+
|
74
|
+
// Used to get gc start/finish information
|
75
|
+
VALUE gc_tracepoint;
|
72
76
|
};
|
73
77
|
|
74
78
|
static VALUE _native_new(VALUE klass);
|
75
|
-
static VALUE _native_initialize(
|
79
|
+
static VALUE _native_initialize(
|
80
|
+
DDTRACE_UNUSED VALUE _self,
|
81
|
+
VALUE self_instance,
|
82
|
+
VALUE cpu_and_wall_time_collector_instance,
|
83
|
+
VALUE gc_profiling_enabled
|
84
|
+
);
|
76
85
|
static void cpu_and_wall_time_worker_typed_data_mark(void *state_ptr);
|
77
86
|
static VALUE _native_sampling_loop(VALUE self, VALUE instance);
|
78
87
|
static VALUE _native_stop(DDTRACE_UNUSED VALUE _self, VALUE self_instance);
|
88
|
+
static VALUE stop(VALUE self_instance, VALUE optional_exception);
|
79
89
|
static void install_sigprof_signal_handler(void (*signal_handler_function)(int, siginfo_t *, void *));
|
80
90
|
static void remove_sigprof_signal_handler(void);
|
81
91
|
static void block_sigprof_signal_handler_from_running_in_current_thread(void);
|
@@ -90,6 +100,13 @@ static VALUE _native_is_running(DDTRACE_UNUSED VALUE self, VALUE instance);
|
|
90
100
|
static void testing_signal_handler(DDTRACE_UNUSED int _signal, DDTRACE_UNUSED siginfo_t *_info, DDTRACE_UNUSED void *_ucontext);
|
91
101
|
static VALUE _native_install_testing_signal_handler(DDTRACE_UNUSED VALUE self);
|
92
102
|
static VALUE _native_remove_testing_signal_handler(DDTRACE_UNUSED VALUE self);
|
103
|
+
static VALUE _native_trigger_sample(DDTRACE_UNUSED VALUE self);
|
104
|
+
static VALUE _native_gc_tracepoint(DDTRACE_UNUSED VALUE self, VALUE instance);
|
105
|
+
static void on_gc_event(VALUE tracepoint_data, DDTRACE_UNUSED void *unused);
|
106
|
+
static void after_gc_from_postponed_job(DDTRACE_UNUSED void *_unused);
|
107
|
+
static void safely_call(VALUE (*function_to_call_safely)(VALUE), VALUE function_to_call_safely_arg, VALUE instance);
|
108
|
+
static VALUE _native_simulate_handle_sampling_signal(DDTRACE_UNUSED VALUE self);
|
109
|
+
static VALUE _native_simulate_sample_from_postponed_job(DDTRACE_UNUSED VALUE self);
|
93
110
|
|
94
111
|
// Global state -- be very careful when accessing or modifying it
|
95
112
|
|
@@ -119,13 +136,17 @@ void collectors_cpu_and_wall_time_worker_init(VALUE profiling_module) {
|
|
119
136
|
// https://bugs.ruby-lang.org/issues/18007 for a discussion around this.
|
120
137
|
rb_define_alloc_func(collectors_cpu_and_wall_time_worker_class, _native_new);
|
121
138
|
|
122
|
-
rb_define_singleton_method(collectors_cpu_and_wall_time_worker_class, "_native_initialize", _native_initialize,
|
139
|
+
rb_define_singleton_method(collectors_cpu_and_wall_time_worker_class, "_native_initialize", _native_initialize, 3);
|
123
140
|
rb_define_singleton_method(collectors_cpu_and_wall_time_worker_class, "_native_sampling_loop", _native_sampling_loop, 1);
|
124
141
|
rb_define_singleton_method(collectors_cpu_and_wall_time_worker_class, "_native_stop", _native_stop, 1);
|
125
142
|
rb_define_singleton_method(testing_module, "_native_current_sigprof_signal_handler", _native_current_sigprof_signal_handler, 0);
|
126
143
|
rb_define_singleton_method(testing_module, "_native_is_running?", _native_is_running, 1);
|
127
144
|
rb_define_singleton_method(testing_module, "_native_install_testing_signal_handler", _native_install_testing_signal_handler, 0);
|
128
145
|
rb_define_singleton_method(testing_module, "_native_remove_testing_signal_handler", _native_remove_testing_signal_handler, 0);
|
146
|
+
rb_define_singleton_method(testing_module, "_native_trigger_sample", _native_trigger_sample, 0);
|
147
|
+
rb_define_singleton_method(testing_module, "_native_gc_tracepoint", _native_gc_tracepoint, 1);
|
148
|
+
rb_define_singleton_method(testing_module, "_native_simulate_handle_sampling_signal", _native_simulate_handle_sampling_signal, 0);
|
149
|
+
rb_define_singleton_method(testing_module, "_native_simulate_sample_from_postponed_job", _native_simulate_sample_from_postponed_job, 0);
|
129
150
|
}
|
130
151
|
|
131
152
|
// This structure is used to define a Ruby object that stores a pointer to a struct cpu_and_wall_time_worker_state
|
@@ -145,17 +166,28 @@ static VALUE _native_new(VALUE klass) {
|
|
145
166
|
struct cpu_and_wall_time_worker_state *state = ruby_xcalloc(1, sizeof(struct cpu_and_wall_time_worker_state));
|
146
167
|
|
147
168
|
state->should_run = false;
|
169
|
+
state->gc_profiling_enabled = false;
|
148
170
|
state->cpu_and_wall_time_collector_instance = Qnil;
|
149
171
|
state->failure_exception = Qnil;
|
172
|
+
state->gc_tracepoint = Qnil;
|
150
173
|
|
151
174
|
return TypedData_Wrap_Struct(klass, &cpu_and_wall_time_worker_typed_data, state);
|
152
175
|
}
|
153
176
|
|
154
|
-
static VALUE _native_initialize(
|
177
|
+
static VALUE _native_initialize(
|
178
|
+
DDTRACE_UNUSED VALUE _self,
|
179
|
+
VALUE self_instance,
|
180
|
+
VALUE cpu_and_wall_time_collector_instance,
|
181
|
+
VALUE gc_profiling_enabled
|
182
|
+
) {
|
183
|
+
ENFORCE_BOOLEAN(gc_profiling_enabled);
|
184
|
+
|
155
185
|
struct cpu_and_wall_time_worker_state *state;
|
156
186
|
TypedData_Get_Struct(self_instance, struct cpu_and_wall_time_worker_state, &cpu_and_wall_time_worker_typed_data, state);
|
157
187
|
|
188
|
+
state->gc_profiling_enabled = (gc_profiling_enabled == Qtrue);
|
158
189
|
state->cpu_and_wall_time_collector_instance = enforce_cpu_and_wall_time_collector_instance(cpu_and_wall_time_collector_instance);
|
190
|
+
state->gc_tracepoint = rb_tracepoint_new(Qnil, RUBY_INTERNAL_EVENT_GC_ENTER | RUBY_INTERNAL_EVENT_GC_EXIT, on_gc_event, NULL /* unused */);
|
159
191
|
|
160
192
|
return Qtrue;
|
161
193
|
}
|
@@ -166,6 +198,7 @@ static void cpu_and_wall_time_worker_typed_data_mark(void *state_ptr) {
|
|
166
198
|
|
167
199
|
rb_gc_mark(state->cpu_and_wall_time_collector_instance);
|
168
200
|
rb_gc_mark(state->failure_exception);
|
201
|
+
rb_gc_mark(state->gc_tracepoint);
|
169
202
|
}
|
170
203
|
|
171
204
|
// Called in a background thread created in CpuAndWallTimeWorker#start
|
@@ -173,11 +206,25 @@ static VALUE _native_sampling_loop(DDTRACE_UNUSED VALUE _self, VALUE instance) {
|
|
173
206
|
struct cpu_and_wall_time_worker_state *state;
|
174
207
|
TypedData_Get_Struct(instance, struct cpu_and_wall_time_worker_state, &cpu_and_wall_time_worker_typed_data, state);
|
175
208
|
|
176
|
-
if (active_sampler_owner_thread != Qnil
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
209
|
+
if (active_sampler_owner_thread != Qnil) {
|
210
|
+
if (is_thread_alive(active_sampler_owner_thread)) {
|
211
|
+
rb_raise(
|
212
|
+
rb_eRuntimeError,
|
213
|
+
"Could not start CpuAndWallTimeWorker: There's already another instance of CpuAndWallTimeWorker active in a different thread"
|
214
|
+
);
|
215
|
+
} else {
|
216
|
+
// The previously active thread seems to have died without cleaning up after itself.
|
217
|
+
// In this case, we can still go ahead and start the profiler BUT we make sure to disable any existing GC tracepoint
|
218
|
+
// first as:
|
219
|
+
// a) If this is a new instance of the CpuAndWallTimeWorker, we don't want the tracepoint from the old instance
|
220
|
+
// being kept around
|
221
|
+
// b) If this is the same instance of the CpuAndWallTimeWorker and we call enable on a tracepoint that is already
|
222
|
+
// enabled, it will start firing more than once, see https://bugs.ruby-lang.org/issues/19114 for details.
|
223
|
+
|
224
|
+
struct cpu_and_wall_time_worker_state *old_state;
|
225
|
+
TypedData_Get_Struct(active_sampler_instance, struct cpu_and_wall_time_worker_state, &cpu_and_wall_time_worker_typed_data, old_state);
|
226
|
+
rb_tracepoint_disable(old_state->gc_tracepoint);
|
227
|
+
}
|
181
228
|
}
|
182
229
|
|
183
230
|
// This write to a global is thread-safe BECAUSE we're still holding on to the global VM lock at this point
|
@@ -189,6 +236,7 @@ static VALUE _native_sampling_loop(DDTRACE_UNUSED VALUE _self, VALUE instance) {
|
|
189
236
|
block_sigprof_signal_handler_from_running_in_current_thread(); // We want to interrupt the thread with the global VM lock, never this one
|
190
237
|
|
191
238
|
install_sigprof_signal_handler(handle_sampling_signal);
|
239
|
+
if (state->gc_profiling_enabled) rb_tracepoint_enable(state->gc_tracepoint);
|
192
240
|
|
193
241
|
// Release GVL, get to the actual work!
|
194
242
|
int exception_state;
|
@@ -196,6 +244,7 @@ static VALUE _native_sampling_loop(DDTRACE_UNUSED VALUE _self, VALUE instance) {
|
|
196
244
|
|
197
245
|
// The sample trigger loop finished (either cleanly or with an error); let's clean up
|
198
246
|
|
247
|
+
rb_tracepoint_disable(state->gc_tracepoint);
|
199
248
|
remove_sigprof_signal_handler();
|
200
249
|
active_sampler_instance = Qnil;
|
201
250
|
active_sampler_owner_thread = Qnil;
|
@@ -209,10 +258,18 @@ static VALUE _native_sampling_loop(DDTRACE_UNUSED VALUE _self, VALUE instance) {
|
|
209
258
|
}
|
210
259
|
|
211
260
|
static VALUE _native_stop(DDTRACE_UNUSED VALUE _self, VALUE self_instance) {
|
261
|
+
return stop(self_instance, /* optional_exception: */ Qnil);
|
262
|
+
}
|
263
|
+
|
264
|
+
static VALUE stop(VALUE self_instance, VALUE optional_exception) {
|
212
265
|
struct cpu_and_wall_time_worker_state *state;
|
213
266
|
TypedData_Get_Struct(self_instance, struct cpu_and_wall_time_worker_state, &cpu_and_wall_time_worker_typed_data, state);
|
214
267
|
|
215
268
|
state->should_run = false;
|
269
|
+
state->failure_exception = optional_exception;
|
270
|
+
|
271
|
+
// Disable the GC tracepoint as soon as possible, so the VM doesn't keep on calling it
|
272
|
+
rb_tracepoint_disable(state->gc_tracepoint);
|
216
273
|
|
217
274
|
return Qtrue;
|
218
275
|
}
|
@@ -264,10 +321,16 @@ static void block_sigprof_signal_handler_from_running_in_current_thread(void) {
|
|
264
321
|
pthread_sigmask(SIG_BLOCK, &signals_to_block, NULL);
|
265
322
|
}
|
266
323
|
|
324
|
+
// NOTE: Remember that this will run in the thread and within the scope of user code, including user C code.
|
325
|
+
// We need to be careful not to change any state that may be observed OR to restore it if we do. For instance, if anything
|
326
|
+
// we do here can set `errno`, then we must be careful to restore the old `errno` after the fact.
|
267
327
|
static void handle_sampling_signal(DDTRACE_UNUSED int _signal, DDTRACE_UNUSED siginfo_t *_info, DDTRACE_UNUSED void *_ucontext) {
|
268
328
|
if (!ruby_thread_has_gvl_p()) {
|
269
329
|
return; // Not safe to enqueue a sample from this thread
|
270
330
|
}
|
331
|
+
if (!ddtrace_rb_ractor_main_p()) {
|
332
|
+
return; // We're not on the main Ractor; we currently don't support profiling non-main Ractors
|
333
|
+
}
|
271
334
|
|
272
335
|
// We implicitly assume there can be no concurrent nor nested calls to handle_sampling_signal because
|
273
336
|
// a) we get triggered using SIGPROF, and the docs state a second SIGPROF will not interrupt an existing one
|
@@ -315,34 +378,23 @@ static void sample_from_postponed_job(DDTRACE_UNUSED void *_unused) {
|
|
315
378
|
// This can potentially happen if the CpuAndWallTimeWorker was stopped while the postponed job was waiting to be executed; nothing to do
|
316
379
|
if (instance == Qnil) return;
|
317
380
|
|
381
|
+
// @ivoanjo: I'm not sure this can ever happen because `handle_sampling_signal` only enqueues this callback if
|
382
|
+
// it's running on the main Ractor, but just in case...
|
383
|
+
if (!ddtrace_rb_ractor_main_p()) {
|
384
|
+
return; // We're not on the main Ractor; we currently don't support profiling non-main Ractors
|
385
|
+
}
|
386
|
+
|
318
387
|
struct cpu_and_wall_time_worker_state *state;
|
319
388
|
TypedData_Get_Struct(instance, struct cpu_and_wall_time_worker_state, &cpu_and_wall_time_worker_typed_data, state);
|
320
389
|
|
321
390
|
// Trigger sampling using the Collectors::CpuAndWallTime; rescue against any exceptions that happen during sampling
|
322
|
-
|
323
|
-
VALUE function_to_call_safely_arg = state->cpu_and_wall_time_collector_instance;
|
324
|
-
VALUE (*exception_handler_function)(VALUE, VALUE) = handle_sampling_failure;
|
325
|
-
VALUE exception_handler_function_arg = instance;
|
326
|
-
rb_rescue2(
|
327
|
-
function_to_call_safely,
|
328
|
-
function_to_call_safely_arg,
|
329
|
-
exception_handler_function,
|
330
|
-
exception_handler_function_arg,
|
331
|
-
rb_eException, // rb_eException is the base class of all Ruby exceptions
|
332
|
-
0 // Required by API to be the last argument
|
333
|
-
);
|
391
|
+
safely_call(cpu_and_wall_time_collector_sample, state->cpu_and_wall_time_collector_instance, instance);
|
334
392
|
}
|
335
393
|
|
336
|
-
static VALUE handle_sampling_failure(VALUE self_instance, VALUE exception) {
|
337
|
-
struct cpu_and_wall_time_worker_state *state;
|
338
|
-
TypedData_Get_Struct(self_instance, struct cpu_and_wall_time_worker_state, &cpu_and_wall_time_worker_typed_data, state);
|
339
|
-
|
340
|
-
state->should_run = false;
|
341
|
-
state->failure_exception = exception;
|
342
|
-
|
343
|
-
return Qnil;
|
344
|
-
}
|
394
|
+
static VALUE handle_sampling_failure(VALUE self_instance, VALUE exception) { return stop(self_instance, exception); }
|
345
395
|
|
396
|
+
// This method exists only to enable testing Datadog::Profiling::Collectors::CpuAndWallTimeWorker behavior using RSpec.
|
397
|
+
// It SHOULD NOT be used for other purposes.
|
346
398
|
static VALUE _native_current_sigprof_signal_handler(DDTRACE_UNUSED VALUE self) {
|
347
399
|
struct sigaction existing_signal_handler_config = {.sa_sigaction = NULL};
|
348
400
|
if (sigaction(SIGPROF, NULL, &existing_signal_handler_config) != 0) {
|
@@ -370,6 +422,8 @@ static VALUE release_gvl_and_run_sampling_trigger_loop(VALUE instance) {
|
|
370
422
|
return Qnil;
|
371
423
|
}
|
372
424
|
|
425
|
+
// This method exists only to enable testing Datadog::Profiling::Collectors::CpuAndWallTimeWorker behavior using RSpec.
|
426
|
+
// It SHOULD NOT be used for other purposes.
|
373
427
|
static VALUE _native_is_running(DDTRACE_UNUSED VALUE self, VALUE instance) {
|
374
428
|
return \
|
375
429
|
(active_sampler_owner_thread != Qnil && is_thread_alive(active_sampler_owner_thread) && active_sampler_instance == instance) ?
|
@@ -380,12 +434,136 @@ static void testing_signal_handler(DDTRACE_UNUSED int _signal, DDTRACE_UNUSED si
|
|
380
434
|
/* Does nothing on purpose */
|
381
435
|
}
|
382
436
|
|
437
|
+
// This method exists only to enable testing Datadog::Profiling::Collectors::CpuAndWallTimeWorker behavior using RSpec.
|
438
|
+
// It SHOULD NOT be used for other purposes.
|
383
439
|
static VALUE _native_install_testing_signal_handler(DDTRACE_UNUSED VALUE self) {
|
384
440
|
install_sigprof_signal_handler(testing_signal_handler);
|
385
441
|
return Qtrue;
|
386
442
|
}
|
387
443
|
|
444
|
+
// This method exists only to enable testing Datadog::Profiling::Collectors::CpuAndWallTimeWorker behavior using RSpec.
|
445
|
+
// It SHOULD NOT be used for other purposes.
|
388
446
|
static VALUE _native_remove_testing_signal_handler(DDTRACE_UNUSED VALUE self) {
|
389
447
|
remove_sigprof_signal_handler();
|
390
448
|
return Qtrue;
|
391
449
|
}
|
450
|
+
|
451
|
+
// This method exists only to enable testing Datadog::Profiling::Collectors::CpuAndWallTimeWorker behavior using RSpec.
|
452
|
+
// It SHOULD NOT be used for other purposes.
|
453
|
+
static VALUE _native_trigger_sample(DDTRACE_UNUSED VALUE self) {
|
454
|
+
sample_from_postponed_job(NULL);
|
455
|
+
return Qtrue;
|
456
|
+
}
|
457
|
+
|
458
|
+
// This method exists only to enable testing Datadog::Profiling::Collectors::CpuAndWallTimeWorker behavior using RSpec.
|
459
|
+
// It SHOULD NOT be used for other purposes.
|
460
|
+
static VALUE _native_gc_tracepoint(DDTRACE_UNUSED VALUE self, VALUE instance) {
|
461
|
+
struct cpu_and_wall_time_worker_state *state;
|
462
|
+
TypedData_Get_Struct(instance, struct cpu_and_wall_time_worker_state, &cpu_and_wall_time_worker_typed_data, state);
|
463
|
+
|
464
|
+
return state->gc_tracepoint;
|
465
|
+
}
|
466
|
+
|
467
|
+
// Implements tracking of cpu-time and wall-time spent doing GC. This function is called by Ruby from the `gc_tracepoint`
|
468
|
+
// when the RUBY_INTERNAL_EVENT_GC_ENTER and RUBY_INTERNAL_EVENT_GC_EXIT events are triggered.
|
469
|
+
//
|
470
|
+
// See the comments on
|
471
|
+
// * cpu_and_wall_time_collector_on_gc_start
|
472
|
+
// * cpu_and_wall_time_collector_on_gc_finish
|
473
|
+
// * cpu_and_wall_time_collector_sample_after_gc
|
474
|
+
//
|
475
|
+
// For the expected times in which to call them, and their assumptions.
|
476
|
+
//
|
477
|
+
// Safety: This function gets called while Ruby is doing garbage collection. While Ruby is doing garbage collection,
|
478
|
+
// *NO ALLOCATION* is allowed. This function, and any it calls must never trigger memory or object allocation.
|
479
|
+
// This includes exceptions and use of ruby_xcalloc (because xcalloc can trigger GC)!
|
480
|
+
static void on_gc_event(VALUE tracepoint_data, DDTRACE_UNUSED void *unused) {
|
481
|
+
if (!ddtrace_rb_ractor_main_p()) {
|
482
|
+
return; // We're not on the main Ractor; we currently don't support profiling non-main Ractors
|
483
|
+
}
|
484
|
+
|
485
|
+
int event = rb_tracearg_event_flag(rb_tracearg_from_tracepoint(tracepoint_data));
|
486
|
+
if (event != RUBY_INTERNAL_EVENT_GC_ENTER && event != RUBY_INTERNAL_EVENT_GC_EXIT) return; // Unknown event
|
487
|
+
|
488
|
+
VALUE instance = active_sampler_instance; // Read from global variable
|
489
|
+
|
490
|
+
// This should not happen in a normal situation because the tracepoint is always enabled after the instance is set
|
491
|
+
// and disabled before it is cleared, but just in case...
|
492
|
+
if (instance == Qnil) return;
|
493
|
+
|
494
|
+
struct cpu_and_wall_time_worker_state *state;
|
495
|
+
if (!rb_typeddata_is_kind_of(instance, &cpu_and_wall_time_worker_typed_data)) return;
|
496
|
+
// This should never fail if the above check passes
|
497
|
+
TypedData_Get_Struct(instance, struct cpu_and_wall_time_worker_state, &cpu_and_wall_time_worker_typed_data, state);
|
498
|
+
|
499
|
+
if (event == RUBY_INTERNAL_EVENT_GC_ENTER) {
|
500
|
+
cpu_and_wall_time_collector_on_gc_start(state->cpu_and_wall_time_collector_instance);
|
501
|
+
} else if (event == RUBY_INTERNAL_EVENT_GC_EXIT) {
|
502
|
+
// Design: In an earlier iteration of this feature (see https://github.com/DataDog/dd-trace-rb/pull/2308) we
|
503
|
+
// actually had a single method to implement the behavior of both cpu_and_wall_time_collector_on_gc_finish
|
504
|
+
// and cpu_and_wall_time_collector_sample_after_gc (the latter is called via after_gc_from_postponed_job).
|
505
|
+
//
|
506
|
+
// Unfortunately, then we discovered the safety issue around no allocations, and thus decided to separate them -- so that
|
507
|
+
// the sampling could run outside the tight safety constraints of the garbage collection process.
|
508
|
+
//
|
509
|
+
// There is a downside: The sample is now taken very shortly after the GC finishes, and not immediately
|
510
|
+
// as the GC finishes, which means the stack captured may be affected by "skid", e.g. point slightly after where
|
511
|
+
// it should be pointing at.
|
512
|
+
// Alternatives to solve this would be to capture no stack for garbage collection (as we do for Java and .net);
|
513
|
+
// making the sampling process allocation-safe (very hard); or separate stack sampling from sample recording,
|
514
|
+
// e.g. enabling us to capture the stack in cpu_and_wall_time_collector_on_gc_finish and do the rest later
|
515
|
+
// (medium hard).
|
516
|
+
|
517
|
+
cpu_and_wall_time_collector_on_gc_finish(state->cpu_and_wall_time_collector_instance);
|
518
|
+
// We use rb_postponed_job_register_one to ask Ruby to run cpu_and_wall_time_collector_sample_after_gc after it
|
519
|
+
// fully finishes the garbage collection, so that one is allowed to do allocations and throw exceptions as usual.
|
520
|
+
rb_postponed_job_register_one(0, after_gc_from_postponed_job, NULL);
|
521
|
+
}
|
522
|
+
}
|
523
|
+
|
524
|
+
static void after_gc_from_postponed_job(DDTRACE_UNUSED void *_unused) {
|
525
|
+
VALUE instance = active_sampler_instance; // Read from global variable
|
526
|
+
|
527
|
+
// This can potentially happen if the CpuAndWallTimeWorker was stopped while the postponed job was waiting to be executed; nothing to do
|
528
|
+
if (instance == Qnil) return;
|
529
|
+
|
530
|
+
// @ivoanjo: I'm not sure this can ever happen because `on_gc_event` only enqueues this callback if
|
531
|
+
// it's running on the main Ractor, but just in case...
|
532
|
+
if (!ddtrace_rb_ractor_main_p()) {
|
533
|
+
return; // We're not on the main Ractor; we currently don't support profiling non-main Ractors
|
534
|
+
}
|
535
|
+
|
536
|
+
struct cpu_and_wall_time_worker_state *state;
|
537
|
+
TypedData_Get_Struct(instance, struct cpu_and_wall_time_worker_state, &cpu_and_wall_time_worker_typed_data, state);
|
538
|
+
|
539
|
+
// Trigger sampling using the Collectors::CpuAndWallTime; rescue against any exceptions that happen during sampling
|
540
|
+
safely_call(cpu_and_wall_time_collector_sample_after_gc, state->cpu_and_wall_time_collector_instance, instance);
|
541
|
+
}
|
542
|
+
|
543
|
+
// Equivalent to Ruby begin/rescue call, where we call a C function and jump to the exception handler if an
|
544
|
+
// exception gets raised within
|
545
|
+
static void safely_call(VALUE (*function_to_call_safely)(VALUE), VALUE function_to_call_safely_arg, VALUE instance) {
|
546
|
+
VALUE exception_handler_function_arg = instance;
|
547
|
+
rb_rescue2(
|
548
|
+
function_to_call_safely,
|
549
|
+
function_to_call_safely_arg,
|
550
|
+
handle_sampling_failure,
|
551
|
+
exception_handler_function_arg,
|
552
|
+
rb_eException, // rb_eException is the base class of all Ruby exceptions
|
553
|
+
0 // Required by API to be the last argument
|
554
|
+
);
|
555
|
+
}
|
556
|
+
|
557
|
+
// This method exists only to enable testing Datadog::Profiling::Collectors::CpuAndWallTimeWorker behavior using RSpec.
|
558
|
+
// It SHOULD NOT be used for other purposes.
|
559
|
+
static VALUE _native_simulate_handle_sampling_signal(DDTRACE_UNUSED VALUE self) {
|
560
|
+
handle_sampling_signal(0, NULL, NULL);
|
561
|
+
return Qtrue;
|
562
|
+
}
|
563
|
+
|
564
|
+
// This method exists only to enable testing Datadog::Profiling::Collectors::CpuAndWallTimeWorker behavior using RSpec.
|
565
|
+
// It SHOULD NOT be used for other purposes.
|
566
|
+
static VALUE _native_simulate_sample_from_postponed_job(DDTRACE_UNUSED VALUE self) {
|
567
|
+
sample_from_postponed_job(NULL);
|
568
|
+
return Qtrue;
|
569
|
+
}
|