ddtrace 1.7.0 → 1.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +50 -1
- data/README.md +2 -2
- data/ext/ddtrace_profiling_loader/extconf.rb +4 -1
- data/ext/ddtrace_profiling_native_extension/NativeExtensionDesign.md +1 -1
- data/ext/ddtrace_profiling_native_extension/clock_id_from_pthread.c +3 -2
- data/ext/ddtrace_profiling_native_extension/collectors_cpu_and_wall_time.c +15 -41
- data/ext/ddtrace_profiling_native_extension/collectors_cpu_and_wall_time.h +1 -1
- data/ext/ddtrace_profiling_native_extension/collectors_cpu_and_wall_time_worker.c +284 -74
- data/ext/ddtrace_profiling_native_extension/collectors_dynamic_sampling_rate.c +142 -0
- data/ext/ddtrace_profiling_native_extension/collectors_dynamic_sampling_rate.h +14 -0
- data/ext/ddtrace_profiling_native_extension/collectors_idle_sampling_helper.c +241 -0
- data/ext/ddtrace_profiling_native_extension/collectors_idle_sampling_helper.h +3 -0
- data/ext/ddtrace_profiling_native_extension/extconf.rb +21 -7
- data/ext/ddtrace_profiling_native_extension/helpers.h +5 -0
- data/ext/ddtrace_profiling_native_extension/native_extension_helpers.rb +8 -0
- data/ext/ddtrace_profiling_native_extension/private_vm_api_access.c +108 -24
- data/ext/ddtrace_profiling_native_extension/private_vm_api_access.h +9 -0
- data/ext/ddtrace_profiling_native_extension/profiling.c +205 -0
- data/ext/ddtrace_profiling_native_extension/ruby_helpers.c +86 -0
- data/ext/ddtrace_profiling_native_extension/ruby_helpers.h +28 -6
- data/ext/ddtrace_profiling_native_extension/setup_signal_handler.c +23 -4
- data/ext/ddtrace_profiling_native_extension/setup_signal_handler.h +4 -0
- data/ext/ddtrace_profiling_native_extension/stack_recorder.c +15 -18
- data/ext/ddtrace_profiling_native_extension/time_helpers.c +17 -0
- data/ext/ddtrace_profiling_native_extension/time_helpers.h +10 -0
- data/lib/datadog/core/configuration/components.rb +27 -6
- data/lib/datadog/core/configuration/ext.rb +18 -0
- data/lib/datadog/core/configuration/settings.rb +14 -341
- data/lib/datadog/core/diagnostics/health.rb +4 -22
- data/lib/datadog/core/environment/variable_helpers.rb +58 -10
- data/lib/datadog/core/utils.rb +0 -21
- data/lib/datadog/core.rb +21 -1
- data/lib/datadog/opentracer/distributed_headers.rb +2 -2
- data/lib/datadog/profiling/collectors/cpu_and_wall_time_worker.rb +16 -5
- data/lib/datadog/profiling/collectors/dynamic_sampling_rate.rb +14 -0
- data/lib/datadog/profiling/collectors/idle_sampling_helper.rb +68 -0
- data/lib/datadog/profiling/stack_recorder.rb +14 -0
- data/lib/datadog/profiling.rb +2 -0
- data/lib/datadog/tracing/configuration/ext.rb +33 -3
- data/lib/datadog/tracing/configuration/settings.rb +433 -0
- data/lib/datadog/tracing/contrib/aws/configuration/settings.rb +4 -1
- data/lib/datadog/tracing/contrib/aws/ext.rb +1 -0
- data/lib/datadog/tracing/contrib/dalli/configuration/settings.rb +4 -1
- data/lib/datadog/tracing/contrib/dalli/ext.rb +1 -0
- data/lib/datadog/tracing/contrib/elasticsearch/configuration/settings.rb +5 -1
- data/lib/datadog/tracing/contrib/elasticsearch/ext.rb +1 -0
- data/lib/datadog/tracing/contrib/ethon/configuration/settings.rb +6 -1
- data/lib/datadog/tracing/contrib/ethon/ext.rb +1 -0
- data/lib/datadog/tracing/contrib/excon/configuration/settings.rb +5 -1
- data/lib/datadog/tracing/contrib/excon/ext.rb +1 -0
- data/lib/datadog/tracing/contrib/faraday/configuration/settings.rb +5 -1
- data/lib/datadog/tracing/contrib/faraday/ext.rb +1 -0
- data/lib/datadog/tracing/contrib/grpc/configuration/settings.rb +6 -1
- data/lib/datadog/tracing/contrib/grpc/distributed/propagation.rb +9 -4
- data/lib/datadog/tracing/contrib/grpc/ext.rb +1 -0
- data/lib/datadog/tracing/contrib/http/configuration/settings.rb +6 -1
- data/lib/datadog/tracing/contrib/http/distributed/propagation.rb +9 -4
- data/lib/datadog/tracing/contrib/http/ext.rb +1 -0
- data/lib/datadog/tracing/contrib/httpclient/configuration/settings.rb +6 -1
- data/lib/datadog/tracing/contrib/httpclient/ext.rb +1 -0
- data/lib/datadog/tracing/contrib/httprb/configuration/settings.rb +6 -1
- data/lib/datadog/tracing/contrib/httprb/ext.rb +1 -0
- data/lib/datadog/tracing/contrib/mongodb/configuration/settings.rb +5 -1
- data/lib/datadog/tracing/contrib/mongodb/ext.rb +1 -0
- data/lib/datadog/tracing/contrib/mysql2/configuration/settings.rb +4 -1
- data/lib/datadog/tracing/contrib/mysql2/ext.rb +1 -0
- data/lib/datadog/tracing/contrib/mysql2/instrumentation.rb +2 -2
- data/lib/datadog/tracing/contrib/patcher.rb +3 -2
- data/lib/datadog/tracing/contrib/pg/configuration/settings.rb +4 -1
- data/lib/datadog/tracing/contrib/pg/ext.rb +1 -0
- data/lib/datadog/tracing/contrib/pg/instrumentation.rb +12 -2
- data/lib/datadog/tracing/contrib/presto/configuration/settings.rb +4 -1
- data/lib/datadog/tracing/contrib/presto/ext.rb +1 -0
- data/lib/datadog/tracing/contrib/propagation/sql_comment/ext.rb +1 -0
- data/lib/datadog/tracing/contrib/propagation/sql_comment.rb +10 -12
- data/lib/datadog/tracing/contrib/redis/configuration/settings.rb +4 -1
- data/lib/datadog/tracing/contrib/redis/ext.rb +1 -0
- data/lib/datadog/tracing/contrib/redis/instrumentation.rb +30 -23
- data/lib/datadog/tracing/contrib/redis/integration.rb +34 -2
- data/lib/datadog/tracing/contrib/redis/patcher.rb +18 -14
- data/lib/datadog/tracing/contrib/redis/quantize.rb +12 -9
- data/lib/datadog/tracing/contrib/redis/tags.rb +4 -6
- data/lib/datadog/tracing/contrib/redis/trace_middleware.rb +72 -0
- data/lib/datadog/tracing/contrib/rest_client/configuration/settings.rb +6 -1
- data/lib/datadog/tracing/contrib/rest_client/ext.rb +1 -0
- data/lib/datadog/{core → tracing}/diagnostics/ext.rb +1 -6
- data/lib/datadog/tracing/diagnostics/health.rb +40 -0
- data/lib/datadog/tracing/distributed/{b3.rb → b3_multi.rb} +2 -2
- data/lib/datadog/tracing/distributed/helpers.rb +2 -1
- data/lib/datadog/tracing/distributed/none.rb +19 -0
- data/lib/datadog/tracing/distributed/trace_context.rb +369 -0
- data/lib/datadog/tracing/metadata/ext.rb +1 -1
- data/lib/datadog/tracing/sampling/priority_sampler.rb +11 -0
- data/lib/datadog/tracing/sampling/rate_sampler.rb +3 -3
- data/lib/datadog/tracing/span.rb +3 -19
- data/lib/datadog/tracing/span_operation.rb +5 -4
- data/lib/datadog/tracing/trace_digest.rb +75 -2
- data/lib/datadog/tracing/trace_operation.rb +5 -4
- data/lib/datadog/tracing/utils.rb +50 -0
- data/lib/ddtrace/version.rb +1 -1
- metadata +20 -5
data/ext/ddtrace_profiling_native_extension/collectors_cpu_and_wall_time_worker.c

@@ -3,12 +3,18 @@
 #include <ruby/thread_native.h>
 #include <ruby/debug.h>
 #include <stdbool.h>
+#include <stdatomic.h>
 #include <signal.h>
+#include <errno.h>
+
 #include "helpers.h"
 #include "ruby_helpers.h"
 #include "collectors_cpu_and_wall_time.h"
+#include "collectors_dynamic_sampling_rate.h"
+#include "collectors_idle_sampling_helper.h"
 #include "private_vm_api_access.h"
 #include "setup_signal_handler.h"
+#include "time_helpers.h"
 
 // Used to trigger the periodic execution of Collectors::CpuAndWallTime, which implements all of the sampling logic
 // itself; this class only implements the "doing it periodically" part.
@@ -71,19 +77,42 @@
 
 // Contains state for a single CpuAndWallTimeWorker instance
 struct cpu_and_wall_time_worker_state {
-  // Important: This is not atomic nor is it guaranteed to replace memory barriers and whatnot. Aka this works for
-  // telling the sampling trigger loop to stop, but if we ever need to communicate more, we should move to actual
-  // atomic operations. stdatomic.h seems a nice thing to reach out for.
-  volatile bool should_run;
+  atomic_bool should_run;
+
   bool gc_profiling_enabled;
+  VALUE self_instance;
   VALUE cpu_and_wall_time_collector_instance;
+  VALUE idle_sampling_helper_instance;
+  VALUE owner_thread;
+  dynamic_sampling_rate_state dynamic_sampling_rate;
 
   // When something goes wrong during sampling, we record the Ruby exception here, so that it can be "re-raised" on
   // the CpuAndWallTimeWorker thread
   VALUE failure_exception;
+  // Used by `_native_stop` to flag the worker thread to start (see comment on `_native_sampling_loop`)
+  VALUE stop_thread;
 
   // Used to get gc start/finish information
   VALUE gc_tracepoint;
+
+  struct stats {
+    // How many times we tried to trigger a sample
+    unsigned int trigger_sample_attempts;
+    // How many times we tried to simulate signal delivery
+    unsigned int trigger_simulated_signal_delivery_attempts;
+    // How many times we actually simulated signal delivery
+    unsigned int simulated_signal_delivery;
+    // How many times we actually called rb_postponed_job_register_one from a signal handler
+    unsigned int signal_handler_enqueued_sample;
+    // How many times the signal handler was called from the wrong thread
+    unsigned int signal_handler_wrong_thread;
+    // How many times we actually sampled (except GC samples)
+    unsigned int sampled;
+    // Min/max/total wall-time spent sampling (except GC samples)
+    uint64_t sampling_time_ns_min;
+    uint64_t sampling_time_ns_max;
+    uint64_t sampling_time_ns_total;
+  } stats;
 };
 
 static VALUE _native_new(VALUE klass);
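Side note on the `should_run` change above: `volatile bool` does not give cross-thread ordering or atomicity guarantees in C, which is why the field becomes a C11 `atomic_bool`. A minimal standalone sketch of the pattern, illustrative only and not ddtrace code:

    // stop_flag.c -- compile with: cc -std=c11 -pthread stop_flag.c
    #include <pthread.h>
    #include <stdatomic.h>
    #include <stdio.h>

    static atomic_bool should_run;

    static void *worker(void *unused) {
      (void) unused;
      // atomic_load guarantees this thread eventually observes the atomic_store below
      while (atomic_load(&should_run)) { /* ...do one unit of work... */ }
      return NULL;
    }

    int main(void) {
      atomic_init(&should_run, true);

      pthread_t thread;
      pthread_create(&thread, NULL, worker, NULL);

      atomic_store(&should_run, false); // Ask the worker to stop...
      pthread_join(thread, NULL);       // ...and wait until it does

      printf("worker stopped\n");
      return 0;
    }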
@@ -91,16 +120,18 @@ static VALUE _native_initialize(
   DDTRACE_UNUSED VALUE _self,
   VALUE self_instance,
   VALUE cpu_and_wall_time_collector_instance,
-  VALUE gc_profiling_enabled
+  VALUE gc_profiling_enabled,
+  VALUE idle_sampling_helper_instance
 );
 static void cpu_and_wall_time_worker_typed_data_mark(void *state_ptr);
 static VALUE _native_sampling_loop(VALUE self, VALUE instance);
-static VALUE _native_stop(DDTRACE_UNUSED VALUE _self, VALUE self_instance);
+static VALUE _native_stop(DDTRACE_UNUSED VALUE _self, VALUE self_instance, VALUE worker_thread);
 static VALUE stop(VALUE self_instance, VALUE optional_exception);
 static void handle_sampling_signal(DDTRACE_UNUSED int _signal, DDTRACE_UNUSED siginfo_t *_info, DDTRACE_UNUSED void *_ucontext);
 static void *run_sampling_trigger_loop(void *state_ptr);
 static void interrupt_sampling_trigger_loop(void *state_ptr);
 static void sample_from_postponed_job(DDTRACE_UNUSED void *_unused);
+static VALUE rescued_sample_from_postponed_job(VALUE self_instance);
 static VALUE handle_sampling_failure(VALUE self_instance, VALUE exception);
 static VALUE _native_current_sigprof_signal_handler(DDTRACE_UNUSED VALUE self);
 static VALUE release_gvl_and_run_sampling_trigger_loop(VALUE instance);
@@ -112,23 +143,30 @@ static VALUE _native_trigger_sample(DDTRACE_UNUSED VALUE self);
 static VALUE _native_gc_tracepoint(DDTRACE_UNUSED VALUE self, VALUE instance);
 static void on_gc_event(VALUE tracepoint_data, DDTRACE_UNUSED void *unused);
 static void after_gc_from_postponed_job(DDTRACE_UNUSED void *_unused);
-static void safely_call(VALUE (*function_to_call_safely)(VALUE), VALUE function_to_call_safely_arg, VALUE instance);
+static VALUE safely_call(VALUE (*function_to_call_safely)(VALUE), VALUE function_to_call_safely_arg, VALUE instance);
 static VALUE _native_simulate_handle_sampling_signal(DDTRACE_UNUSED VALUE self);
 static VALUE _native_simulate_sample_from_postponed_job(DDTRACE_UNUSED VALUE self);
 static VALUE _native_reset_after_fork(DDTRACE_UNUSED VALUE self, VALUE instance);
-
-
-
-
-
+static VALUE _native_is_sigprof_blocked_in_current_thread(DDTRACE_UNUSED VALUE self);
+static VALUE _native_stats(DDTRACE_UNUSED VALUE self, VALUE instance);
+void *simulate_sampling_signal_delivery(DDTRACE_UNUSED void *_unused);
+static void grab_gvl_and_sample(void);
+static void reset_stats(struct cpu_and_wall_time_worker_state *state);
+static void sleep_for(uint64_t time_ns);
+
+// Note on sampler global state safety:
+//
+// Both `active_sampler_instance` and `active_sampler_instance_state` are **GLOBAL** state. Be careful when accessing
+// or modifying them.
+// In particular, it's important to only mutate them while holding the global VM lock, to ensure correctness.
+//
+// This global state is needed because a bunch of functions on this file need to access it from situations
+// (e.g. signal handler) where it's impossible or just awkward to pass it as an argument.
 static VALUE active_sampler_instance = Qnil;
-
-// to detect when it is outdated)
-static VALUE active_sampler_owner_thread = Qnil;
+struct cpu_and_wall_time_worker_state *active_sampler_instance_state = NULL;
 
 void collectors_cpu_and_wall_time_worker_init(VALUE profiling_module) {
   rb_global_variable(&active_sampler_instance);
-  rb_global_variable(&active_sampler_owner_thread);
 
   VALUE collectors_module = rb_define_module_under(profiling_module, "Collectors");
   VALUE collectors_cpu_and_wall_time_worker_class = rb_define_class_under(collectors_module, "CpuAndWallTimeWorker", rb_cObject);
@@ -145,10 +183,11 @@ void collectors_cpu_and_wall_time_worker_init(VALUE profiling_module) {
   // https://bugs.ruby-lang.org/issues/18007 for a discussion around this.
   rb_define_alloc_func(collectors_cpu_and_wall_time_worker_class, _native_new);
 
-  rb_define_singleton_method(collectors_cpu_and_wall_time_worker_class, "_native_initialize", _native_initialize, 3);
+  rb_define_singleton_method(collectors_cpu_and_wall_time_worker_class, "_native_initialize", _native_initialize, 4);
   rb_define_singleton_method(collectors_cpu_and_wall_time_worker_class, "_native_sampling_loop", _native_sampling_loop, 1);
-  rb_define_singleton_method(collectors_cpu_and_wall_time_worker_class, "_native_stop", _native_stop, 1);
+  rb_define_singleton_method(collectors_cpu_and_wall_time_worker_class, "_native_stop", _native_stop, 2);
   rb_define_singleton_method(collectors_cpu_and_wall_time_worker_class, "_native_reset_after_fork", _native_reset_after_fork, 1);
+  rb_define_singleton_method(collectors_cpu_and_wall_time_worker_class, "_native_stats", _native_stats, 1);
   rb_define_singleton_method(testing_module, "_native_current_sigprof_signal_handler", _native_current_sigprof_signal_handler, 0);
   rb_define_singleton_method(testing_module, "_native_is_running?", _native_is_running, 1);
   rb_define_singleton_method(testing_module, "_native_install_testing_signal_handler", _native_install_testing_signal_handler, 0);
@@ -157,6 +196,7 @@ void collectors_cpu_and_wall_time_worker_init(VALUE profiling_module) {
   rb_define_singleton_method(testing_module, "_native_gc_tracepoint", _native_gc_tracepoint, 1);
   rb_define_singleton_method(testing_module, "_native_simulate_handle_sampling_signal", _native_simulate_handle_sampling_signal, 0);
   rb_define_singleton_method(testing_module, "_native_simulate_sample_from_postponed_job", _native_simulate_sample_from_postponed_job, 0);
+  rb_define_singleton_method(testing_module, "_native_is_sigprof_blocked_in_current_thread", _native_is_sigprof_blocked_in_current_thread, 0);
 }
 
 // This structure is used to define a Ruby object that stores a pointer to a struct cpu_and_wall_time_worker_state
@@ -166,7 +206,7 @@ static const rb_data_type_t cpu_and_wall_time_worker_typed_data = {
   .function = {
     .dmark = cpu_and_wall_time_worker_typed_data_mark,
     .dfree = RUBY_DEFAULT_FREE,
-    .dsize = NULL, // We don't track
+    .dsize = NULL, // We don't track memory usage (although it'd be cool if we did!)
     //.dcompact = NULL, // FIXME: Add support for compaction
   },
   .flags = RUBY_TYPED_FREE_IMMEDIATELY
@@ -175,20 +215,26 @@ static const rb_data_type_t cpu_and_wall_time_worker_typed_data = {
 static VALUE _native_new(VALUE klass) {
   struct cpu_and_wall_time_worker_state *state = ruby_xcalloc(1, sizeof(struct cpu_and_wall_time_worker_state));
 
-  state->should_run = false;
+  atomic_init(&state->should_run, false);
   state->gc_profiling_enabled = false;
   state->cpu_and_wall_time_collector_instance = Qnil;
+  state->idle_sampling_helper_instance = Qnil;
+  state->owner_thread = Qnil;
+  dynamic_sampling_rate_init(&state->dynamic_sampling_rate);
   state->failure_exception = Qnil;
+  state->stop_thread = Qnil;
   state->gc_tracepoint = Qnil;
+  reset_stats(state);
 
-  return TypedData_Wrap_Struct(klass, &cpu_and_wall_time_worker_typed_data, state);
+  return state->self_instance = TypedData_Wrap_Struct(klass, &cpu_and_wall_time_worker_typed_data, state);
 }
 
 static VALUE _native_initialize(
   DDTRACE_UNUSED VALUE _self,
   VALUE self_instance,
   VALUE cpu_and_wall_time_collector_instance,
-  VALUE gc_profiling_enabled
+  VALUE gc_profiling_enabled,
+  VALUE idle_sampling_helper_instance
 ) {
   ENFORCE_BOOLEAN(gc_profiling_enabled);
 
@@ -197,6 +243,7 @@ static VALUE _native_initialize(
 
   state->gc_profiling_enabled = (gc_profiling_enabled == Qtrue);
   state->cpu_and_wall_time_collector_instance = enforce_cpu_and_wall_time_collector_instance(cpu_and_wall_time_collector_instance);
+  state->idle_sampling_helper_instance = idle_sampling_helper_instance;
   state->gc_tracepoint = rb_tracepoint_new(Qnil, RUBY_INTERNAL_EVENT_GC_ENTER | RUBY_INTERNAL_EVENT_GC_EXIT, on_gc_event, NULL /* unused */);
 
   return Qtrue;
@@ -207,7 +254,10 @@ static void cpu_and_wall_time_worker_typed_data_mark(void *state_ptr) {
   struct cpu_and_wall_time_worker_state *state = (struct cpu_and_wall_time_worker_state *) state_ptr;
 
   rb_gc_mark(state->cpu_and_wall_time_collector_instance);
+  rb_gc_mark(state->idle_sampling_helper_instance);
+  rb_gc_mark(state->owner_thread);
   rb_gc_mark(state->failure_exception);
+  rb_gc_mark(state->stop_thread);
   rb_gc_mark(state->gc_tracepoint);
 }
 
@@ -216,8 +266,9 @@ static VALUE _native_sampling_loop(DDTRACE_UNUSED VALUE _self, VALUE instance) {
   struct cpu_and_wall_time_worker_state *state;
   TypedData_Get_Struct(instance, struct cpu_and_wall_time_worker_state, &cpu_and_wall_time_worker_typed_data, state);
 
-
-
+  struct cpu_and_wall_time_worker_state *old_state = active_sampler_instance_state;
+  if (old_state != NULL) {
+    if (is_thread_alive(old_state->owner_thread)) {
       rb_raise(
         rb_eRuntimeError,
         "Could not start CpuAndWallTimeWorker: There's already another instance of CpuAndWallTimeWorker active in a different thread"
@@ -231,23 +282,26 @@
     // b) If this is the same instance of the CpuAndWallTimeWorker if we call enable on a tracepoint that is already
     // enabled, it will start firing more than once, see https://bugs.ruby-lang.org/issues/19114 for details.
 
-    struct cpu_and_wall_time_worker_state *old_state;
-    TypedData_Get_Struct(active_sampler_instance, struct cpu_and_wall_time_worker_state, &cpu_and_wall_time_worker_typed_data, old_state);
     rb_tracepoint_disable(old_state->gc_tracepoint);
   }
 }
 
+  // We use `stop_thread` to distinguish when `_native_stop` was called before we actually had a chance to start. In this
+  // situation we stop immediately and never even start the sampling trigger loop.
+  if (state->stop_thread == rb_thread_current()) return Qnil;
+
+  // Reset the dynamic sampling rate state, if any (reminder: the monotonic clock reference may change after a fork)
+  dynamic_sampling_rate_reset(&state->dynamic_sampling_rate);
+
   // This write to a global is thread-safe BECAUSE we're still holding on to the global VM lock at this point
+  active_sampler_instance_state = state;
   active_sampler_instance = instance;
-  active_sampler_owner_thread = rb_thread_current();
+  state->owner_thread = rb_thread_current();
 
-  state->should_run = true;
+  atomic_store(&state->should_run, true);
 
   block_sigprof_signal_handler_from_running_in_current_thread(); // We want to interrupt the thread with the global VM lock, never this one
 
-  install_sigprof_signal_handler(handle_sampling_signal, "handle_sampling_signal");
-  if (state->gc_profiling_enabled) rb_tracepoint_enable(state->gc_tracepoint);
-
   // Release GVL, get to the actual work!
   int exception_state;
   rb_protect(release_gvl_and_run_sampling_trigger_loop, instance, &exception_state);
@@ -256,6 +310,19 @@
 
   rb_tracepoint_disable(state->gc_tracepoint);
 
+  active_sampler_instance_state = NULL;
+  active_sampler_instance = Qnil;
+  state->owner_thread = Qnil;
+
+  // If this `Thread` is about to die, why is this important? It's because Ruby caches native threads for a period after
+  // the `Thread` dies, and reuses them if a new Ruby `Thread` gets created. This means that while conceptually the
+  // worker background `Thread` is about to die, the low-level native OS thread can be reused for something else in the Ruby app.
+  // Then, the reused thread would "inherit" the SIGPROF blocking, which is... really unexpected.
+  // This actually caused a flaky test -- the `native_extension_spec.rb` creates a `Thread` and tries to specifically
+  // send SIGPROF signals to it, and oops it could fail if it got the reused native thread from the worker which still
+  // had SIGPROF delivery blocked. :hide_the_pain_harold:
+  unblock_sigprof_signal_handler_from_running_in_current_thread();
+
   // Why replace and not use remove the signal handler? We do this because when a process receives a SIGPROF without
   // having an explicit signal handler set up, the process will instantly terminate with a confusing
   // "Profiling timer expired" message left behind. (This message doesn't come from us -- it's the default message for
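Side note on the SIGPROF unblocking added above: the `block_...`/`unblock_sigprof_signal_handler_from_running_in_current_thread` helpers are presumably implemented in setup_signal_handler.c (also touched in this release); per POSIX, per-thread masking of this kind is built on `pthread_sigmask`. A generic sketch of that mechanism, an assumption about the helpers' shape rather than the actual ddtrace implementation:

    #include <pthread.h>
    #include <signal.h>

    // Signal masks are per-thread: blocking SIGPROF here does not affect other threads.
    // This is also why the worker must unblock before its native thread gets recycled,
    // as the comment in the hunk above explains.
    static int block_sigprof_in_current_thread(void) {
      sigset_t mask;
      sigemptyset(&mask);
      sigaddset(&mask, SIGPROF);
      return pthread_sigmask(SIG_BLOCK, &mask, NULL);
    }

    static int unblock_sigprof_in_current_thread(void) {
      sigset_t mask;
      sigemptyset(&mask);
      sigaddset(&mask, SIGPROF);
      return pthread_sigmask(SIG_UNBLOCK, &mask, NULL);
    }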
@@ -265,11 +332,10 @@
   // profiler-sent signals by the time we get here and want to clean up.
   // @ivoanjo: I suspect this will never happen, but the cost of getting it wrong is really high (VM terminates) so this
   // is a just-in-case situation.
+  //
+  // Note 2: This can raise exceptions as well, so make sure that all cleanups are done by the time we get here.
   replace_sigprof_signal_handler_with_empty_handler(handle_sampling_signal);
 
-  active_sampler_instance = Qnil;
-  active_sampler_owner_thread = Qnil;
-
   // Ensure that instance is not garbage collected while the native sampling loop is running; this is probably not needed, but just in case
   RB_GC_GUARD(instance);
 
@@ -278,7 +344,12 @@
   return Qnil;
 }
 
-static VALUE _native_stop(DDTRACE_UNUSED VALUE _self, VALUE self_instance) {
+static VALUE _native_stop(DDTRACE_UNUSED VALUE _self, VALUE self_instance, VALUE worker_thread) {
+  struct cpu_and_wall_time_worker_state *state;
+  TypedData_Get_Struct(self_instance, struct cpu_and_wall_time_worker_state, &cpu_and_wall_time_worker_typed_data, state);
+
+  state->stop_thread = worker_thread;
+
   return stop(self_instance, /* optional_exception: */ Qnil);
 }
 
@@ -286,7 +357,7 @@ static VALUE stop(VALUE self_instance, VALUE optional_exception) {
   struct cpu_and_wall_time_worker_state *state;
   TypedData_Get_Struct(self_instance, struct cpu_and_wall_time_worker_state, &cpu_and_wall_time_worker_typed_data, state);
 
-  state->should_run = false;
+  atomic_store(&state->should_run, false);
   state->failure_exception = optional_exception;
 
   // Disable the GC tracepoint as soon as possible, so the VM doesn't keep on calling it
@@ -299,41 +370,76 @@
 // We need to be careful not to change any state that may be observed OR to restore it if we do. For instance, if anything
 // we do here can set `errno`, then we must be careful to restore the old `errno` after the fact.
 static void handle_sampling_signal(DDTRACE_UNUSED int _signal, DDTRACE_UNUSED siginfo_t *_info, DDTRACE_UNUSED void *_ucontext) {
-
-
-
-  if (
-
+  struct cpu_and_wall_time_worker_state *state = active_sampler_instance_state; // Read from global variable, see "sampler global state safety" note above
+
+  // This can potentially happen if the CpuAndWallTimeWorker was stopped while the signal delivery was happening; nothing to do
+  if (state == NULL) return;
+
+  if (
+    !ruby_native_thread_p() || // Not a Ruby thread
+    !is_current_thread_holding_the_gvl() || // Not safe to enqueue a sample from this thread
+    !ddtrace_rb_ractor_main_p() // We're not on the main Ractor; we currently don't support profiling non-main Ractors
+  ) {
+    state->stats.signal_handler_wrong_thread++;
+    return;
   }
 
   // We implicitly assume there can be no concurrent nor nested calls to handle_sampling_signal because
   // a) we get triggered using SIGPROF, and the docs state second SIGPROF will not interrupt an existing one
   // b) we validate we are in the thread that has the global VM lock; if a different thread gets a signal, it will return early
   // because it will not have the global VM lock
-  // TODO: Validate that this does not impact Ractors
 
   // Note: rb_postponed_job_register_one ensures that if there's a previous sample_from_postponed_job queued for execution
   // then we will not queue a second one. It does this by doing a linear scan on the existing jobs; in the future we
   // may want to implement that check ourselves.
 
-
+  state->stats.signal_handler_enqueued_sample++;
+
+  // Note: If we ever want to get rid of rb_postponed_job_register_one, remember not to clobber Ruby exceptions, as
+  // this function does this helpful job for us now -- https://github.com/ruby/ruby/commit/a98e343d39c4d7bf1e2190b076720f32d9f298b3.
   /*int result =*/ rb_postponed_job_register_one(0, sample_from_postponed_job, NULL);
+  // TODO: Do something with result (potentially update tracking counters?)
 }
 
 // The actual sampling trigger loop always runs **without** the global vm lock.
 static void *run_sampling_trigger_loop(void *state_ptr) {
   struct cpu_and_wall_time_worker_state *state = (struct cpu_and_wall_time_worker_state *) state_ptr;
 
-
+  uint64_t minimum_time_between_signals = MILLIS_AS_NS(10);
+
+  while (atomic_load(&state->should_run)) {
+    state->stats.trigger_sample_attempts++;
 
-  while (state->should_run) {
     // TODO: This is still a placeholder for a more complex mechanism. In particular:
-    // * We want to signal a particular thread or threads, not the process in general
-    // * We want to track if a signal landed on the thread holding the global VM lock and do something about it
    // * We want to do more than having a fixed sampling rate
 
-
-
+    current_gvl_owner owner = gvl_owner();
+    if (owner.valid) {
+      // Note that reading the GVL owner and sending them a signal is a race -- the Ruby VM keeps on executing while
+      // we're doing this, so we may still not signal the correct thread from time to time, but our signal handler
+      // includes a check to see if it got called in the right thread
+      pthread_kill(owner.owner, SIGPROF);
+    } else {
+      // If no thread owns the Global VM Lock, the application is probably idle at the moment. We still want to sample
+      // so we "ask a friend" (the IdleSamplingHelper component) to grab the GVL and simulate getting a SIGPROF.
+      //
+      // In a previous version of the code, we called `grab_gvl_and_sample` directly BUT this was problematic because
+      // Ruby may concurrently get busy and so the CpuAndWallTimeWorker would be blocked in line to acquire the GVL
+      // for an uncontrolled amount of time. (This can still happen to the IdleSamplingHelper, but the
+      // CpuAndWallTimeWorker will still be free to interrupt the Ruby VM and keep sampling for the entire blocking period).
+      state->stats.trigger_simulated_signal_delivery_attempts++;
+      idle_sampling_helper_request_action(state->idle_sampling_helper_instance, grab_gvl_and_sample);
+    }
+
+    sleep_for(minimum_time_between_signals);
+
+    // The dynamic sampling rate module keeps track of how long samples are taking, and in here we extend our sleep time
+    // to take that into account.
+    // Note that we deliberately should NOT combine this sleep_for with the one above because the result of
+    // `dynamic_sampling_rate_get_sleep` may have changed while the above sleep was ongoing.
+    uint64_t extra_sleep =
+      dynamic_sampling_rate_get_sleep(&state->dynamic_sampling_rate, monotonic_wall_time_now_ns(DO_NOT_RAISE_ON_FAILURE));
+    if (extra_sleep > 0) sleep_for(extra_sleep);
  }
 
   return NULL; // Unused
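Side note on the rewritten trigger loop: the old loop signaled the whole process, while the new one reads the GVL owner and signals that specific thread with `pthread_kill`, falling back to the IdleSamplingHelper when nobody holds the GVL. A self-contained sketch of directed SIGPROF delivery, illustrative only (thread discovery and handler logic are heavily simplified here):

    // directed_signal.c -- compile with: cc -std=c11 -pthread directed_signal.c
    #include <pthread.h>
    #include <signal.h>
    #include <stdio.h>
    #include <unistd.h>

    static void handle_sigprof(int signal) { (void) signal; /* async-signal-safe work only */ }

    static void *target_thread(void *unused) {
      (void) unused;
      for (;;) pause(); // Sleep until a signal arrives
      return NULL;
    }

    int main(void) {
      struct sigaction action = { .sa_handler = handle_sigprof };
      sigaction(SIGPROF, &action, NULL);

      pthread_t thread;
      pthread_create(&thread, NULL, target_thread, NULL);
      sleep(1); // Crude synchronization; fine for a demo

      // Unlike kill(getpid(), SIGPROF), this interrupts only `thread`
      pthread_kill(thread, SIGPROF);
      printf("SIGPROF sent to the target thread\n");
      return 0;
    }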
@@ -343,14 +449,14 @@ static void *run_sampling_trigger_loop(void *state_ptr) {
 static void interrupt_sampling_trigger_loop(void *state_ptr) {
   struct cpu_and_wall_time_worker_state *state = (struct cpu_and_wall_time_worker_state *) state_ptr;
 
-  state->should_run = false;
+  atomic_store(&state->should_run, false);
 }
 
 static void sample_from_postponed_job(DDTRACE_UNUSED void *_unused) {
-  VALUE instance = active_sampler_instance;
+  struct cpu_and_wall_time_worker_state *state = active_sampler_instance_state; // Read from global variable, see "sampler global state safety" note above
 
   // This can potentially happen if the CpuAndWallTimeWorker was stopped while the postponed job was waiting to be executed; nothing to do
-  if (instance == Qnil) return;
+  if (state == NULL) return;
 
   // @ivoanjo: I'm not sure this can ever happen because `handle_sampling_signal` only enqueues this callback if
   // it's running on the main Ractor, but just in case...
@@ -358,14 +464,45 @@
     return; // We're not on the main Ractor; we currently don't support profiling non-main Ractors
   }
 
+  // Rescue against any exceptions that happen during sampling
+  safely_call(rescued_sample_from_postponed_job, state->self_instance, state->self_instance);
+}
+
+static VALUE rescued_sample_from_postponed_job(VALUE self_instance) {
   struct cpu_and_wall_time_worker_state *state;
-  TypedData_Get_Struct(instance, struct cpu_and_wall_time_worker_state, &cpu_and_wall_time_worker_typed_data, state);
+  TypedData_Get_Struct(self_instance, struct cpu_and_wall_time_worker_state, &cpu_and_wall_time_worker_typed_data, state);
 
-  // Trigger sampling using the Collectors::CpuAndWallTime; rescue against any exceptions that happen during sampling
-  safely_call(cpu_and_wall_time_collector_sample, state->cpu_and_wall_time_collector_instance, instance);
+  long wall_time_ns_before_sample = monotonic_wall_time_now_ns(RAISE_ON_FAILURE);
+
+  if (!dynamic_sampling_rate_should_sample(&state->dynamic_sampling_rate, wall_time_ns_before_sample)) {
+    // TODO: Add a counter for this
+    return Qnil;
+  }
+
+  state->stats.sampled++;
+
+  cpu_and_wall_time_collector_sample(state->cpu_and_wall_time_collector_instance, wall_time_ns_before_sample);
+
+  long wall_time_ns_after_sample = monotonic_wall_time_now_ns(RAISE_ON_FAILURE);
+  long delta_ns = wall_time_ns_after_sample - wall_time_ns_before_sample;
+
+  // Guard against wall-time going backwards, see https://github.com/DataDog/dd-trace-rb/pull/2336 for discussion.
+  uint64_t sampling_time_ns = delta_ns < 0 ? 0 : delta_ns;
+
+  state->stats.sampling_time_ns_min = uint64_min_of(sampling_time_ns, state->stats.sampling_time_ns_min);
+  state->stats.sampling_time_ns_max = uint64_max_of(sampling_time_ns, state->stats.sampling_time_ns_max);
+  state->stats.sampling_time_ns_total += sampling_time_ns;
+
+  dynamic_sampling_rate_after_sample(&state->dynamic_sampling_rate, wall_time_ns_after_sample, sampling_time_ns);
+
+  // Return a dummy VALUE because we're called from rb_rescue2 which requires it
+  return Qnil;
 }
 
-static VALUE handle_sampling_failure(VALUE self_instance, VALUE exception) { stop(self_instance, exception); return Qnil; }
+static VALUE handle_sampling_failure(VALUE self_instance, VALUE exception) {
+  stop(self_instance, exception);
+  return Qnil;
+}
 
 // This method exists only to enable testing Datadog::Profiling::Collectors::CpuAndWallTimeWorker behavior using RSpec.
 // It SHOULD NOT be used for other purposes.
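Side note on the timing bookkeeping in `rescued_sample_from_postponed_job` above: the min starts at `UINT64_MAX` (see `reset_stats` later in this diff) and negative clock deltas are clamped to zero. The same pattern in a standalone program using `CLOCK_MONOTONIC`, illustrative only and not the actual `time_helpers.c` implementation:

    // sampling_stats.c -- compile with: cc -std=c11 sampling_stats.c
    #include <stdint.h>
    #include <stdio.h>
    #include <time.h>

    static uint64_t monotonic_ns(void) {
      struct timespec now;
      clock_gettime(CLOCK_MONOTONIC, &now);
      return (uint64_t) now.tv_sec * 1000000000ULL + (uint64_t) now.tv_nsec;
    }

    int main(void) {
      uint64_t min = UINT64_MAX, max = 0, total = 0; // min starts at UINT64_MAX on purpose
      unsigned int sampled = 0;

      for (int i = 0; i < 5; i++) {
        uint64_t before = monotonic_ns();
        /* ...take one sample... */
        uint64_t elapsed = monotonic_ns() - before;

        if (elapsed < min) min = elapsed;
        if (elapsed > max) max = elapsed;
        total += elapsed;
        sampled++;
      }

      printf("min=%llu max=%llu avg=%f\n",
        (unsigned long long) min, (unsigned long long) max, ((double) total) / sampled);
      return 0;
    }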
@@ -390,6 +527,11 @@ static VALUE release_gvl_and_run_sampling_trigger_loop(VALUE instance) {
   struct cpu_and_wall_time_worker_state *state;
   TypedData_Get_Struct(instance, struct cpu_and_wall_time_worker_state, &cpu_and_wall_time_worker_typed_data, state);
 
+  // Final preparations: Setup signal handler and enable tracepoint. We run these here and not in `_native_sampling_loop`
+  // because they may raise exceptions.
+  install_sigprof_signal_handler(handle_sampling_signal, "handle_sampling_signal");
+  if (state->gc_profiling_enabled) rb_tracepoint_enable(state->gc_tracepoint);
+
   rb_thread_call_without_gvl(run_sampling_trigger_loop, state, interrupt_sampling_trigger_loop, state);
 
   // If we stopped sampling due to an exception, re-raise it (now in the worker thread)
@@ -401,9 +543,9 @@
 // This method exists only to enable testing Datadog::Profiling::Collectors::CpuAndWallTimeWorker behavior using RSpec.
 // It SHOULD NOT be used for other purposes.
 static VALUE _native_is_running(DDTRACE_UNUSED VALUE self, VALUE instance) {
-
-
-
+  struct cpu_and_wall_time_worker_state *state = active_sampler_instance_state; // Read from global variable, see "sampler global state safety" note above
+
+  return (state != NULL && is_thread_alive(state->owner_thread) && state->self_instance == instance) ? Qtrue : Qfalse;
 }
 
 static void testing_signal_handler(DDTRACE_UNUSED int _signal, DDTRACE_UNUSED siginfo_t *_info, DDTRACE_UNUSED void *_ucontext) {
@@ -461,16 +603,11 @@ static void on_gc_event(VALUE tracepoint_data, DDTRACE_UNUSED void *unused) {
   int event = rb_tracearg_event_flag(rb_tracearg_from_tracepoint(tracepoint_data));
   if (event != RUBY_INTERNAL_EVENT_GC_ENTER && event != RUBY_INTERNAL_EVENT_GC_EXIT) return; // Unknown event
 
-  VALUE instance = active_sampler_instance;
+  struct cpu_and_wall_time_worker_state *state = active_sampler_instance_state; // Read from global variable, see "sampler global state safety" note above
 
   // This should not happen in a normal situation because the tracepoint is always enabled after the instance is set
   // and disabled before it is cleared, but just in case...
-  if (instance == Qnil) return;
-
-  struct cpu_and_wall_time_worker_state *state;
-  if (!rb_typeddata_is_kind_of(instance, &cpu_and_wall_time_worker_typed_data)) return;
-  // This should never fail the the above check passes
-  TypedData_Get_Struct(instance, struct cpu_and_wall_time_worker_state, &cpu_and_wall_time_worker_typed_data, state);
+  if (state == NULL) return;
 
   if (event == RUBY_INTERNAL_EVENT_GC_ENTER) {
     cpu_and_wall_time_collector_on_gc_start(state->cpu_and_wall_time_collector_instance);
@@ -493,15 +630,18 @@ static void on_gc_event(VALUE tracepoint_data, DDTRACE_UNUSED void *unused) {
     cpu_and_wall_time_collector_on_gc_finish(state->cpu_and_wall_time_collector_instance);
     // We use rb_postponed_job_register_one to ask Ruby to run cpu_and_wall_time_collector_sample_after_gc after if
     // fully finishes the garbage collection, so that one is allowed to do allocations and throw exceptions as usual.
+    //
+    // Note: If we ever want to get rid of rb_postponed_job_register_one, remember not to clobber Ruby exceptions, as
+    // this function does this helpful job for us now -- https://github.com/ruby/ruby/commit/a98e343d39c4d7bf1e2190b076720f32d9f298b3.
     rb_postponed_job_register_one(0, after_gc_from_postponed_job, NULL);
   }
 }
 
 static void after_gc_from_postponed_job(DDTRACE_UNUSED void *_unused) {
-  VALUE instance = active_sampler_instance;
+  struct cpu_and_wall_time_worker_state *state = active_sampler_instance_state; // Read from global variable, see "sampler global state safety" note above
 
   // This can potentially happen if the CpuAndWallTimeWorker was stopped while the postponed job was waiting to be executed; nothing to do
-  if (instance == Qnil) return;
+  if (state == NULL) return;
 
   // @ivoanjo: I'm not sure this can ever happen because `on_gc_event` only enqueues this callback if
   // it's running on the main Ractor, but just in case...
@@ -509,18 +649,15 @@
     return; // We're not on the main Ractor; we currently don't support profiling non-main Ractors
   }
 
-  struct cpu_and_wall_time_worker_state *state;
-  TypedData_Get_Struct(instance, struct cpu_and_wall_time_worker_state, &cpu_and_wall_time_worker_typed_data, state);
-
   // Trigger sampling using the Collectors::CpuAndWallTime; rescue against any exceptions that happen during sampling
-  safely_call(cpu_and_wall_time_collector_sample_after_gc, state->cpu_and_wall_time_collector_instance, instance);
+  safely_call(cpu_and_wall_time_collector_sample_after_gc, state->cpu_and_wall_time_collector_instance, state->self_instance);
 }
 
 // Equivalent to Ruby begin/rescue call, where we call a C function and jump to the exception handler if an
 // exception gets raised within
-static void safely_call(VALUE (*function_to_call_safely)(VALUE), VALUE function_to_call_safely_arg, VALUE instance) {
+static VALUE safely_call(VALUE (*function_to_call_safely)(VALUE), VALUE function_to_call_safely_arg, VALUE instance) {
   VALUE exception_handler_function_arg = instance;
-  rb_rescue2(
+  return rb_rescue2(
     function_to_call_safely,
     function_to_call_safely_arg,
     handle_sampling_failure,
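Side note on `safely_call` above: `rb_rescue2` is the C-level equivalent of Ruby's begin/rescue. It calls the first function and, if a Ruby exception of one of the listed classes is raised inside it, invokes the handler instead of letting the exception unwind through C frames. A hedged sketch of the same pattern in a generic extension (the function names here are made up for illustration):

    #include <ruby.h>

    static VALUE might_raise(VALUE arg) {
      rb_raise(rb_eRuntimeError, "boom");
      return arg; // Unreachable
    }

    static VALUE on_failure(VALUE handler_arg, VALUE exception) {
      (void) handler_arg;
      // Inspect or store `exception` here; returning normally swallows it
      return Qnil;
    }

    static VALUE call_safely(VALUE arg) {
      // The trailing arguments list the exception classes to rescue, terminated by (VALUE) 0
      return rb_rescue2(might_raise, arg, on_failure, Qnil, rb_eException, (VALUE) 0);
    }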
@@ -559,8 +696,81 @@ static VALUE _native_reset_after_fork(DDTRACE_UNUSED VALUE self, VALUE instance)
   // Disable all tracepoints, so that there are no more attempts to mutate the profile
   rb_tracepoint_disable(state->gc_tracepoint);
 
+  reset_stats(state);
+
   // Remove all state from the `Collectors::CpuAndWallTime` and connected downstream components
   rb_funcall(state->cpu_and_wall_time_collector_instance, rb_intern("reset_after_fork"), 0);
 
   return Qtrue;
 }
+
+static VALUE _native_is_sigprof_blocked_in_current_thread(DDTRACE_UNUSED VALUE self) {
+  return is_sigprof_blocked_in_current_thread();
+}
+
+static VALUE _native_stats(DDTRACE_UNUSED VALUE self, VALUE instance) {
+  struct cpu_and_wall_time_worker_state *state;
+  TypedData_Get_Struct(instance, struct cpu_and_wall_time_worker_state, &cpu_and_wall_time_worker_typed_data, state);
+
+  VALUE pretty_sampling_time_ns_min = state->stats.sampling_time_ns_min == UINT64_MAX ? Qnil : ULL2NUM(state->stats.sampling_time_ns_min);
+  VALUE pretty_sampling_time_ns_max = state->stats.sampling_time_ns_max == 0 ? Qnil : ULL2NUM(state->stats.sampling_time_ns_max);
+  VALUE pretty_sampling_time_ns_total = state->stats.sampling_time_ns_total == 0 ? Qnil : ULL2NUM(state->stats.sampling_time_ns_total);
+  VALUE pretty_sampling_time_ns_avg =
+    state->stats.sampled == 0 ? Qnil : DBL2NUM(((double) state->stats.sampling_time_ns_total) / state->stats.sampled);
+
+  VALUE stats_as_hash = rb_hash_new();
+  VALUE arguments[] = {
+    ID2SYM(rb_intern("trigger_sample_attempts")), /* => */ UINT2NUM(state->stats.trigger_sample_attempts),
+    ID2SYM(rb_intern("trigger_simulated_signal_delivery_attempts")), /* => */ UINT2NUM(state->stats.trigger_simulated_signal_delivery_attempts),
+    ID2SYM(rb_intern("simulated_signal_delivery")), /* => */ UINT2NUM(state->stats.simulated_signal_delivery),
+    ID2SYM(rb_intern("signal_handler_enqueued_sample")), /* => */ UINT2NUM(state->stats.signal_handler_enqueued_sample),
+    ID2SYM(rb_intern("signal_handler_wrong_thread")), /* => */ UINT2NUM(state->stats.signal_handler_wrong_thread),
+    ID2SYM(rb_intern("sampled")), /* => */ UINT2NUM(state->stats.sampled),
+    ID2SYM(rb_intern("sampling_time_ns_min")), /* => */ pretty_sampling_time_ns_min,
+    ID2SYM(rb_intern("sampling_time_ns_max")), /* => */ pretty_sampling_time_ns_max,
+    ID2SYM(rb_intern("sampling_time_ns_total")), /* => */ pretty_sampling_time_ns_total,
+    ID2SYM(rb_intern("sampling_time_ns_avg")), /* => */ pretty_sampling_time_ns_avg,
+  };
+  for (long unsigned int i = 0; i < VALUE_COUNT(arguments); i += 2) rb_hash_aset(stats_as_hash, arguments[i], arguments[i+1]);
+  return stats_as_hash;
+}
+
+void *simulate_sampling_signal_delivery(DDTRACE_UNUSED void *_unused) {
+  struct cpu_and_wall_time_worker_state *state = active_sampler_instance_state; // Read from global variable, see "sampler global state safety" note above
+
+  // This can potentially happen if the CpuAndWallTimeWorker was stopped while the IdleSamplingHelper was trying to execute this action
+  if (state == NULL) return NULL;
+
+  state->stats.simulated_signal_delivery++;
+
+  // @ivoanjo: We could instead directly call sample_from_postponed_job, but I chose to go through the signal handler
+  // so that the simulated case is as close to the original one as well (including any metrics increases, etc).
+  handle_sampling_signal(0, NULL, NULL);
+
+  return NULL; // Unused
+}
+
+static void grab_gvl_and_sample(void) { rb_thread_call_with_gvl(simulate_sampling_signal_delivery, NULL); }
+
+static void reset_stats(struct cpu_and_wall_time_worker_state *state) {
+  state->stats = (struct stats) {}; // Resets all stats back to zero
+  state->stats.sampling_time_ns_min = UINT64_MAX; // Since we always take the min between existing and latest sample
+}
+
+static void sleep_for(uint64_t time_ns) {
+  // As a simplification, we currently only support setting .tv_nsec
+  if (time_ns >= SECONDS_AS_NS(1)) {
+    grab_gvl_and_raise(rb_eArgError, "sleep_for can only sleep for less than 1 second, time_ns: %"PRIu64, time_ns);
+  }
+
+  struct timespec time_to_sleep = {.tv_nsec = time_ns};
+
+  while (nanosleep(&time_to_sleep, &time_to_sleep) != 0) {
+    if (errno == EINTR) {
+      // We were interrupted. nanosleep updates "time_to_sleep" to contain only the remaining time, so we just let the
+      // loop keep going.
+    } else {
+      ENFORCE_SUCCESS_NO_GVL(errno);
+    }
+  }
+}
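A closing side note on `sleep_for` above: it relies on the classic `nanosleep` retry idiom, where an EINTR failure leaves the remaining time in the struct passed as the second argument, so looping with the same struct completes the originally requested sleep. A standalone version without the under-one-second restriction, illustrative only:

    // sleep_ns.c -- compile with: cc -std=c11 sleep_ns.c
    #include <errno.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <time.h>

    static int sleep_ns(uint64_t time_ns) {
      struct timespec time_to_sleep = {
        .tv_sec  = (time_t) (time_ns / 1000000000ULL),
        .tv_nsec = (long) (time_ns % 1000000000ULL), // tv_nsec must stay below one second
      };

      // On EINTR, nanosleep wrote the remaining time into time_to_sleep, so just retry
      while (nanosleep(&time_to_sleep, &time_to_sleep) != 0) {
        if (errno != EINTR) return errno; // A real failure, e.g. EINVAL
      }
      return 0;
    }

    int main(void) {
      int result = sleep_ns(50 * 1000 * 1000); // 50ms
      printf("sleep_ns returned %d\n", result);
      return 0;
    }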