datadog 2.3.0 → 2.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +37 -1
- data/ext/datadog_profiling_loader/datadog_profiling_loader.c +9 -1
- data/ext/datadog_profiling_loader/extconf.rb +10 -22
- data/ext/datadog_profiling_native_extension/collectors_cpu_and_wall_time_worker.c +148 -30
- data/ext/datadog_profiling_native_extension/collectors_discrete_dynamic_sampler.c +4 -2
- data/ext/datadog_profiling_native_extension/collectors_stack.c +89 -46
- data/ext/datadog_profiling_native_extension/collectors_thread_context.c +580 -29
- data/ext/datadog_profiling_native_extension/collectors_thread_context.h +9 -1
- data/ext/datadog_profiling_native_extension/datadog_ruby_common.c +0 -27
- data/ext/datadog_profiling_native_extension/datadog_ruby_common.h +0 -4
- data/ext/datadog_profiling_native_extension/extconf.rb +38 -21
- data/ext/datadog_profiling_native_extension/gvl_profiling_helper.c +50 -0
- data/ext/datadog_profiling_native_extension/gvl_profiling_helper.h +75 -0
- data/ext/datadog_profiling_native_extension/heap_recorder.c +20 -6
- data/ext/datadog_profiling_native_extension/http_transport.c +38 -6
- data/ext/datadog_profiling_native_extension/private_vm_api_access.c +52 -1
- data/ext/datadog_profiling_native_extension/private_vm_api_access.h +3 -0
- data/ext/datadog_profiling_native_extension/profiling.c +1 -1
- data/ext/datadog_profiling_native_extension/stack_recorder.h +1 -0
- data/ext/libdatadog_api/crashtracker.c +20 -18
- data/ext/libdatadog_api/datadog_ruby_common.c +0 -27
- data/ext/libdatadog_api/datadog_ruby_common.h +0 -4
- data/ext/libdatadog_extconf_helpers.rb +1 -1
- data/lib/datadog/appsec/assets/waf_rules/recommended.json +2184 -108
- data/lib/datadog/appsec/assets/waf_rules/strict.json +1430 -2
- data/lib/datadog/appsec/component.rb +29 -8
- data/lib/datadog/appsec/configuration/settings.rb +2 -2
- data/lib/datadog/appsec/contrib/devise/patcher/authenticatable_patch.rb +1 -0
- data/lib/datadog/appsec/contrib/devise/patcher/rememberable_patch.rb +21 -0
- data/lib/datadog/appsec/contrib/devise/patcher.rb +12 -2
- data/lib/datadog/appsec/contrib/graphql/appsec_trace.rb +0 -14
- data/lib/datadog/appsec/contrib/graphql/gateway/multiplex.rb +67 -31
- data/lib/datadog/appsec/contrib/graphql/gateway/watcher.rb +18 -15
- data/lib/datadog/appsec/contrib/graphql/integration.rb +14 -1
- data/lib/datadog/appsec/contrib/rack/gateway/request.rb +2 -5
- data/lib/datadog/appsec/event.rb +1 -1
- data/lib/datadog/appsec/processor/rule_loader.rb +3 -1
- data/lib/datadog/appsec/processor/rule_merger.rb +33 -15
- data/lib/datadog/appsec/processor.rb +36 -37
- data/lib/datadog/appsec/rate_limiter.rb +25 -40
- data/lib/datadog/appsec/remote.rb +7 -3
- data/lib/datadog/appsec.rb +2 -2
- data/lib/datadog/core/configuration/components.rb +4 -3
- data/lib/datadog/core/configuration/settings.rb +84 -5
- data/lib/datadog/core/crashtracking/component.rb +1 -1
- data/lib/datadog/core/environment/execution.rb +5 -5
- data/lib/datadog/core/metrics/client.rb +7 -0
- data/lib/datadog/core/rate_limiter.rb +183 -0
- data/lib/datadog/core/remote/client/capabilities.rb +4 -3
- data/lib/datadog/core/remote/component.rb +4 -2
- data/lib/datadog/core/remote/negotiation.rb +4 -4
- data/lib/datadog/core/remote/tie.rb +2 -0
- data/lib/datadog/core/runtime/metrics.rb +1 -1
- data/lib/datadog/core/telemetry/component.rb +2 -0
- data/lib/datadog/core/telemetry/event.rb +12 -7
- data/lib/datadog/core/telemetry/logger.rb +51 -0
- data/lib/datadog/core/telemetry/logging.rb +50 -14
- data/lib/datadog/core/telemetry/request.rb +13 -1
- data/lib/datadog/core/utils/time.rb +12 -0
- data/lib/datadog/di/code_tracker.rb +168 -0
- data/lib/datadog/di/configuration/settings.rb +163 -0
- data/lib/datadog/di/configuration.rb +11 -0
- data/lib/datadog/di/error.rb +31 -0
- data/lib/datadog/di/extensions.rb +16 -0
- data/lib/datadog/di/probe.rb +133 -0
- data/lib/datadog/di/probe_builder.rb +41 -0
- data/lib/datadog/di/redactor.rb +188 -0
- data/lib/datadog/di/serializer.rb +193 -0
- data/lib/datadog/di.rb +14 -0
- data/lib/datadog/opentelemetry/sdk/propagator.rb +2 -0
- data/lib/datadog/profiling/collectors/cpu_and_wall_time_worker.rb +12 -10
- data/lib/datadog/profiling/collectors/info.rb +12 -3
- data/lib/datadog/profiling/collectors/thread_context.rb +26 -0
- data/lib/datadog/profiling/component.rb +20 -4
- data/lib/datadog/profiling/http_transport.rb +6 -1
- data/lib/datadog/profiling/scheduler.rb +2 -0
- data/lib/datadog/profiling/stack_recorder.rb +3 -0
- data/lib/datadog/single_step_instrument.rb +12 -0
- data/lib/datadog/tracing/contrib/action_cable/instrumentation.rb +8 -12
- data/lib/datadog/tracing/contrib/action_pack/action_controller/instrumentation.rb +5 -0
- data/lib/datadog/tracing/contrib/action_pack/action_dispatch/instrumentation.rb +78 -0
- data/lib/datadog/tracing/contrib/action_pack/action_dispatch/patcher.rb +33 -0
- data/lib/datadog/tracing/contrib/action_pack/patcher.rb +2 -0
- data/lib/datadog/tracing/contrib/active_record/configuration/resolver.rb +4 -0
- data/lib/datadog/tracing/contrib/active_record/events/instantiation.rb +3 -1
- data/lib/datadog/tracing/contrib/active_record/events/sql.rb +3 -1
- data/lib/datadog/tracing/contrib/active_support/cache/events/cache.rb +5 -1
- data/lib/datadog/tracing/contrib/aws/instrumentation.rb +5 -0
- data/lib/datadog/tracing/contrib/elasticsearch/patcher.rb +6 -1
- data/lib/datadog/tracing/contrib/faraday/middleware.rb +9 -0
- data/lib/datadog/tracing/contrib/grape/endpoint.rb +19 -0
- data/lib/datadog/tracing/contrib/graphql/patcher.rb +9 -12
- data/lib/datadog/tracing/contrib/graphql/trace_patcher.rb +3 -3
- data/lib/datadog/tracing/contrib/graphql/tracing_patcher.rb +3 -3
- data/lib/datadog/tracing/contrib/graphql/unified_trace.rb +13 -9
- data/lib/datadog/tracing/contrib/graphql/unified_trace_patcher.rb +6 -3
- data/lib/datadog/tracing/contrib/http/instrumentation.rb +18 -15
- data/lib/datadog/tracing/contrib/httpclient/instrumentation.rb +6 -5
- data/lib/datadog/tracing/contrib/httpclient/patcher.rb +1 -14
- data/lib/datadog/tracing/contrib/httprb/instrumentation.rb +5 -0
- data/lib/datadog/tracing/contrib/httprb/patcher.rb +1 -14
- data/lib/datadog/tracing/contrib/lograge/patcher.rb +1 -2
- data/lib/datadog/tracing/contrib/mongodb/subscribers.rb +2 -0
- data/lib/datadog/tracing/contrib/opensearch/patcher.rb +13 -6
- data/lib/datadog/tracing/contrib/patcher.rb +2 -1
- data/lib/datadog/tracing/contrib/presto/patcher.rb +1 -13
- data/lib/datadog/tracing/contrib/rack/middlewares.rb +27 -0
- data/lib/datadog/tracing/contrib/redis/tags.rb +4 -0
- data/lib/datadog/tracing/contrib/sinatra/tracer.rb +4 -0
- data/lib/datadog/tracing/contrib/stripe/request.rb +3 -2
- data/lib/datadog/tracing/distributed/propagation.rb +7 -0
- data/lib/datadog/tracing/metadata/ext.rb +2 -0
- data/lib/datadog/tracing/remote.rb +5 -2
- data/lib/datadog/tracing/sampling/matcher.rb +6 -1
- data/lib/datadog/tracing/sampling/rate_sampler.rb +1 -1
- data/lib/datadog/tracing/sampling/rule.rb +2 -0
- data/lib/datadog/tracing/sampling/rule_sampler.rb +9 -5
- data/lib/datadog/tracing/sampling/span/ext.rb +1 -1
- data/lib/datadog/tracing/sampling/span/rule.rb +2 -2
- data/lib/datadog/tracing/trace_operation.rb +26 -2
- data/lib/datadog/tracing/tracer.rb +14 -12
- data/lib/datadog/tracing/transport/http/client.rb +1 -0
- data/lib/datadog/tracing/transport/io/client.rb +1 -0
- data/lib/datadog/tracing/workers/trace_writer.rb +1 -1
- data/lib/datadog/tracing/workers.rb +1 -1
- data/lib/datadog/version.rb +1 -1
- metadata +25 -8
- data/lib/datadog/tracing/sampling/rate_limiter.rb +0 -185
data/ext/datadog_profiling_native_extension/collectors_thread_context.c

@@ -76,6 +76,11 @@
 #define MISSING_TRACER_CONTEXT_KEY 0
 #define TIME_BETWEEN_GC_EVENTS_NS MILLIS_AS_NS(10)
 
+// This is used as a placeholder to mark threads that are allowed to be profiled (enabled)
+// (e.g. to avoid trying to gvl profile threads that are not from the main Ractor)
+// and for which there's no data yet
+#define GVL_WAITING_ENABLED_EMPTY RUBY_FIXNUM_MAX
+
 static ID at_active_span_id; // id of :@active_span in Ruby
 static ID at_active_trace_id; // id of :@active_trace in Ruby
 static ID at_id_id; // id of :@id in Ruby
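The slot introduced by this hunk is reused throughout the rest of this file's changes. As a reading aid (not part of the diff), here is a minimal standalone sketch of the four kinds of values that the later `handle_gvl_waiting` / `_native_on_gvl_*` changes store in it; `INTPTR_MAX` stands in for `RUBY_FIXNUM_MAX` so the sketch compiles on its own.

```c
// Sketch only: summarizes the encoding used by the GVL profiling changes in this diff.
#include <stdint.h>
#include <stdio.h>

#define GVL_WAITING_ENABLED_EMPTY INTPTR_MAX // stand-in for RUBY_FIXNUM_MAX in this sketch

static const char *gvl_slot_meaning(intptr_t gvl_waiting_at) {
  if (gvl_waiting_at == 0) return "thread not enabled for GVL profiling";
  if (gvl_waiting_at == GVL_WAITING_ENABLED_EMPTY) return "enabled, but no waiting recorded yet";
  if (gvl_waiting_at > 0) return "still waiting for the GVL since this monotonic timestamp (ns)";
  return "wait ended; absolute value is the monotonic timestamp (ns) when the waiting started";
}

int main(void) {
  intptr_t examples[] = {0, GVL_WAITING_ENABLED_EMPTY, 1234567890L, -1234567890L};
  for (int i = 0; i < 4; i++) {
    printf("%jd -> %s\n", (intmax_t) examples[i], gvl_slot_meaning(examples[i]));
  }
  return 0;
}
```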
@@ -86,6 +91,26 @@ static ID at_otel_values_id; // id of :@otel_values in Ruby
 static ID at_parent_span_id_id; // id of :@parent_span_id in Ruby
 static ID at_datadog_trace_id; // id of :@datadog_trace in Ruby
 
+// Used to support reading trace identifiers from the opentelemetry Ruby library when the ddtrace gem tracing
+// integration is NOT in use.
+static ID at_span_id_id; // id of :@span_id in Ruby
+static ID at_trace_id_id; // id of :@trace_id in Ruby
+static ID at_entries_id; // id of :@entries in Ruby
+static ID at_context_id; // id of :@context in Ruby
+static ID at_kind_id; // id of :@kind in Ruby
+static ID at_name_id; // id of :@name in Ruby
+static ID server_id; // id of :server in Ruby
+static ID otel_context_storage_id; // id of :__opentelemetry_context_storage__ in Ruby
+
+// This is used by `thread_context_collector_on_gvl_running`. Because when that method gets called we're not sure if
+// it's safe to access the state of the thread context collector, we store this setting as a global value. This does
+// mean this setting is shared among all thread context collectors, and thus it's "last writer wins".
+// In production this should not be a problem: there should only be one profiler, which is the last one created,
+// and that'll be the one that last wrote this setting.
+static uint32_t global_waiting_for_gvl_threshold_ns = MILLIS_AS_NS(10);
+
+enum otel_context_enabled {otel_context_enabled_false, otel_context_enabled_only, otel_context_enabled_both};
+
 // Contains state for a single ThreadContext instance
 struct thread_context_collector_state {
   // Note: Places in this file that usually need to be changed when this struct is changed are tagged with
@@ -112,6 +137,8 @@ struct thread_context_collector_state {
   bool endpoint_collection_enabled;
   // Used to omit timestamps / timeline events from collected data
   bool timeline_enabled;
+  // Used to control context collection
+  enum otel_context_enabled otel_context_enabled;
   // Used to omit class information from collected allocation data
   bool allocation_type_enabled;
   // Used when calling monotonic_to_system_epoch_ns
@@ -119,6 +146,8 @@ struct thread_context_collector_state {
   // Used to identify the main thread, to give it a fallback name
   VALUE main_thread;
   // Used when extracting trace identifiers from otel spans. Lazily initialized.
+  // Qtrue serves as a marker we've not yet extracted it; when we try to extract it, we set it to an object if
+  // successful and Qnil if not.
   VALUE otel_current_span_key;
 
   struct stats {
@@ -164,6 +193,12 @@ struct trace_identifiers {
   VALUE trace_endpoint;
 };
 
+struct otel_span {
+  VALUE span;
+  VALUE span_id;
+  VALUE trace_id;
+};
+
 static void thread_context_collector_typed_data_mark(void *state_ptr);
 static void thread_context_collector_typed_data_free(void *state_ptr);
 static int hash_map_per_thread_context_mark(st_data_t key_thread, st_data_t _value, st_data_t _argument);
@@ -177,13 +212,15 @@ static VALUE _native_initialize(
   VALUE tracer_context_key,
   VALUE endpoint_collection_enabled,
   VALUE timeline_enabled,
+  VALUE waiting_for_gvl_threshold_ns,
+  VALUE otel_context_enabled,
   VALUE allocation_type_enabled
 );
 static VALUE _native_sample(VALUE self, VALUE collector_instance, VALUE profiler_overhead_stack_thread);
 static VALUE _native_on_gc_start(VALUE self, VALUE collector_instance);
 static VALUE _native_on_gc_finish(VALUE self, VALUE collector_instance);
-static VALUE _native_sample_after_gc(DDTRACE_UNUSED VALUE self, VALUE collector_instance);
-void update_metrics_and_sample(
+static VALUE _native_sample_after_gc(DDTRACE_UNUSED VALUE self, VALUE collector_instance, VALUE reset_monotonic_to_system_state);
+static void update_metrics_and_sample(
   struct thread_context_collector_state *state,
   VALUE thread_being_sampled,
   VALUE stack_from_thread,
@@ -201,7 +238,8 @@ static void trigger_sample_for_thread(
   sample_values values,
   long current_monotonic_wall_time_ns,
   ddog_CharSlice *ruby_vm_type,
-  ddog_CharSlice *class_name
+  ddog_CharSlice *class_name,
+  bool is_gvl_waiting_state
 );
 static VALUE _native_thread_list(VALUE self);
 static struct per_thread_context *get_or_create_context_for(VALUE thread, struct thread_context_collector_state *state);
@@ -237,6 +275,26 @@ static void ddtrace_otel_trace_identifiers_for(
   VALUE otel_values
 );
 static VALUE _native_sample_skipped_allocation_samples(DDTRACE_UNUSED VALUE self, VALUE collector_instance, VALUE skipped_samples);
+static bool handle_gvl_waiting(
+  struct thread_context_collector_state *state,
+  VALUE thread_being_sampled,
+  VALUE stack_from_thread,
+  struct per_thread_context *thread_context,
+  sampling_buffer* sampling_buffer,
+  long current_cpu_time_ns
+);
+static VALUE _native_on_gvl_waiting(DDTRACE_UNUSED VALUE self, VALUE thread);
+static VALUE _native_gvl_waiting_at_for(DDTRACE_UNUSED VALUE self, VALUE thread);
+static VALUE _native_on_gvl_running(DDTRACE_UNUSED VALUE self, VALUE thread);
+static VALUE _native_sample_after_gvl_running(DDTRACE_UNUSED VALUE self, VALUE collector_instance, VALUE thread);
+static VALUE _native_apply_delta_to_cpu_time_at_previous_sample_ns(DDTRACE_UNUSED VALUE self, VALUE collector_instance, VALUE thread, VALUE delta_ns);
+static void otel_without_ddtrace_trace_identifiers_for(
+  struct thread_context_collector_state *state,
+  VALUE thread,
+  struct trace_identifiers *trace_identifiers_result
+);
+static struct otel_span otel_span_from(VALUE otel_context, VALUE otel_current_span_key);
+static uint64_t otel_span_id_to_uint(VALUE otel_span_id);
 
 void collectors_thread_context_init(VALUE profiling_module) {
   VALUE collectors_module = rb_define_module_under(profiling_module, "Collectors");
@@ -254,20 +312,27 @@ void collectors_thread_context_init(VALUE profiling_module) {
   // https://bugs.ruby-lang.org/issues/18007 for a discussion around this.
   rb_define_alloc_func(collectors_thread_context_class, _native_new);
 
-  rb_define_singleton_method(collectors_thread_context_class, "_native_initialize", _native_initialize,
+  rb_define_singleton_method(collectors_thread_context_class, "_native_initialize", _native_initialize, 9);
   rb_define_singleton_method(collectors_thread_context_class, "_native_inspect", _native_inspect, 1);
   rb_define_singleton_method(collectors_thread_context_class, "_native_reset_after_fork", _native_reset_after_fork, 1);
   rb_define_singleton_method(testing_module, "_native_sample", _native_sample, 2);
   rb_define_singleton_method(testing_module, "_native_sample_allocation", _native_sample_allocation, 3);
   rb_define_singleton_method(testing_module, "_native_on_gc_start", _native_on_gc_start, 1);
   rb_define_singleton_method(testing_module, "_native_on_gc_finish", _native_on_gc_finish, 1);
-  rb_define_singleton_method(testing_module, "_native_sample_after_gc", _native_sample_after_gc,
+  rb_define_singleton_method(testing_module, "_native_sample_after_gc", _native_sample_after_gc, 2);
   rb_define_singleton_method(testing_module, "_native_thread_list", _native_thread_list, 0);
   rb_define_singleton_method(testing_module, "_native_per_thread_context", _native_per_thread_context, 1);
   rb_define_singleton_method(testing_module, "_native_stats", _native_stats, 1);
   rb_define_singleton_method(testing_module, "_native_gc_tracking", _native_gc_tracking, 1);
   rb_define_singleton_method(testing_module, "_native_new_empty_thread", _native_new_empty_thread, 0);
   rb_define_singleton_method(testing_module, "_native_sample_skipped_allocation_samples", _native_sample_skipped_allocation_samples, 2);
+  #ifndef NO_GVL_INSTRUMENTATION
+    rb_define_singleton_method(testing_module, "_native_on_gvl_waiting", _native_on_gvl_waiting, 1);
+    rb_define_singleton_method(testing_module, "_native_gvl_waiting_at_for", _native_gvl_waiting_at_for, 1);
+    rb_define_singleton_method(testing_module, "_native_on_gvl_running", _native_on_gvl_running, 1);
+    rb_define_singleton_method(testing_module, "_native_sample_after_gvl_running", _native_sample_after_gvl_running, 2);
+    rb_define_singleton_method(testing_module, "_native_apply_delta_to_cpu_time_at_previous_sample_ns", _native_apply_delta_to_cpu_time_at_previous_sample_ns, 3);
+  #endif
 
   at_active_span_id = rb_intern_const("@active_span");
   at_active_trace_id = rb_intern_const("@active_trace");
@@ -278,6 +343,19 @@ void collectors_thread_context_init(VALUE profiling_module) {
   at_otel_values_id = rb_intern_const("@otel_values");
   at_parent_span_id_id = rb_intern_const("@parent_span_id");
   at_datadog_trace_id = rb_intern_const("@datadog_trace");
+  at_span_id_id = rb_intern_const("@span_id");
+  at_trace_id_id = rb_intern_const("@trace_id");
+  at_entries_id = rb_intern_const("@entries");
+  at_context_id = rb_intern_const("@context");
+  at_kind_id = rb_intern_const("@kind");
+  at_name_id = rb_intern_const("@name");
+  server_id = rb_intern_const("server");
+  otel_context_storage_id = rb_intern_const("__opentelemetry_context_storage__");
+
+  #ifndef NO_GVL_INSTRUMENTATION
+    // This will raise if Ruby already ran out of thread-local keys
+    gvl_profiling_init();
+  #endif
 
   gc_profiling_init();
 }
@@ -357,11 +435,12 @@ static VALUE _native_new(VALUE klass) {
   state->thread_list_buffer = thread_list_buffer;
   state->endpoint_collection_enabled = true;
   state->timeline_enabled = true;
+  state->otel_context_enabled = otel_context_enabled_false;
   state->allocation_type_enabled = true;
   state->time_converter_state = (monotonic_to_system_epoch_state) MONOTONIC_TO_SYSTEM_EPOCH_INITIALIZER;
   VALUE main_thread = rb_thread_main();
   state->main_thread = main_thread;
-  state->otel_current_span_key =
+  state->otel_current_span_key = Qtrue;
   state->gc_tracking.wall_time_at_previous_gc_ns = INVALID_TIME;
   state->gc_tracking.wall_time_at_last_flushed_gc_event_ns = 0;
 
@@ -377,6 +456,7 @@ static VALUE _native_new(VALUE klass) {
   return instance;
 }
 
+// TODO: Convert this to use options like CpuAndWallTimeWorker
 static VALUE _native_initialize(
   DDTRACE_UNUSED VALUE _self,
   VALUE collector_instance,
@@ -385,10 +465,13 @@ static VALUE _native_initialize(
   VALUE tracer_context_key,
   VALUE endpoint_collection_enabled,
   VALUE timeline_enabled,
+  VALUE waiting_for_gvl_threshold_ns,
+  VALUE otel_context_enabled,
   VALUE allocation_type_enabled
 ) {
   ENFORCE_BOOLEAN(endpoint_collection_enabled);
   ENFORCE_BOOLEAN(timeline_enabled);
+  ENFORCE_TYPE(waiting_for_gvl_threshold_ns, T_FIXNUM);
   ENFORCE_BOOLEAN(allocation_type_enabled);
 
   struct thread_context_collector_state *state;
@@ -401,8 +484,19 @@ static VALUE _native_initialize(
   state->recorder_instance = enforce_recorder_instance(recorder_instance);
   state->endpoint_collection_enabled = (endpoint_collection_enabled == Qtrue);
   state->timeline_enabled = (timeline_enabled == Qtrue);
+  if (otel_context_enabled == Qfalse || otel_context_enabled == Qnil) {
+    state->otel_context_enabled = otel_context_enabled_false;
+  } else if (otel_context_enabled == ID2SYM(rb_intern("only"))) {
+    state->otel_context_enabled = otel_context_enabled_only;
+  } else if (otel_context_enabled == ID2SYM(rb_intern("both"))) {
+    state->otel_context_enabled = otel_context_enabled_both;
+  } else {
+    rb_raise(rb_eArgError, "Unexpected value for otel_context_enabled: %+" PRIsVALUE, otel_context_enabled);
+  }
   state->allocation_type_enabled = (allocation_type_enabled == Qtrue);
 
+  global_waiting_for_gvl_threshold_ns = NUM2UINT(waiting_for_gvl_threshold_ns);
+
   if (RTEST(tracer_context_key)) {
     ENFORCE_TYPE(tracer_context_key, T_SYMBOL);
     // Note about rb_to_id and dynamic symbols: calling `rb_to_id` prevents symbols from ever being garbage collected.
@@ -433,13 +527,22 @@ static VALUE _native_on_gc_start(DDTRACE_UNUSED VALUE self, VALUE collector_inst
 // This method exists only to enable testing Datadog::Profiling::Collectors::ThreadContext behavior using RSpec.
 // It SHOULD NOT be used for other purposes.
 static VALUE _native_on_gc_finish(DDTRACE_UNUSED VALUE self, VALUE collector_instance) {
-  thread_context_collector_on_gc_finish(collector_instance);
+  (void) !thread_context_collector_on_gc_finish(collector_instance);
   return Qtrue;
 }
 
 // This method exists only to enable testing Datadog::Profiling::Collectors::ThreadContext behavior using RSpec.
 // It SHOULD NOT be used for other purposes.
-static VALUE _native_sample_after_gc(DDTRACE_UNUSED VALUE self, VALUE collector_instance) {
+static VALUE _native_sample_after_gc(DDTRACE_UNUSED VALUE self, VALUE collector_instance, VALUE reset_monotonic_to_system_state) {
+  ENFORCE_BOOLEAN(reset_monotonic_to_system_state);
+
+  struct thread_context_collector_state *state;
+  TypedData_Get_Struct(collector_instance, struct thread_context_collector_state, &thread_context_collector_typed_data, state);
+
+  if (reset_monotonic_to_system_state == Qtrue) {
+    state->time_converter_state = (monotonic_to_system_epoch_state) MONOTONIC_TO_SYSTEM_EPOCH_INITIALIZER;
+  }
+
   thread_context_collector_sample_after_gc(collector_instance);
   return Qtrue;
 }
@@ -502,7 +605,7 @@ void thread_context_collector_sample(VALUE self_instance, long current_monotonic
   );
 }
 
-void update_metrics_and_sample(
+static void update_metrics_and_sample(
   struct thread_context_collector_state *state,
   VALUE thread_being_sampled,
   VALUE stack_from_thread, // This can be different when attributing profiler overhead using a different stack
@@ -511,12 +614,17 @@ void update_metrics_and_sample(
   long current_cpu_time_ns,
   long current_monotonic_wall_time_ns
 ) {
-
+  bool is_gvl_waiting_state =
+    handle_gvl_waiting(state, thread_being_sampled, stack_from_thread, thread_context, sampling_buffer, current_cpu_time_ns);
+
+  // Don't assign/update cpu during "Waiting for GVL"
+  long cpu_time_elapsed_ns = is_gvl_waiting_state ? 0 : update_time_since_previous_sample(
     &thread_context->cpu_time_at_previous_sample_ns,
     current_cpu_time_ns,
     thread_context->gc_tracking.cpu_time_at_start_ns,
     IS_NOT_WALL_TIME
   );
+
   long wall_time_elapsed_ns = update_time_since_previous_sample(
     &thread_context->wall_time_at_previous_sample_ns,
     current_monotonic_wall_time_ns,
@@ -528,6 +636,21 @@ void update_metrics_and_sample(
     IS_WALL_TIME
   );
 
+  // A thread enters "Waiting for GVL", well, as the name implies, without the GVL.
+  //
+  // As a consequence, it's possible that a thread enters "Waiting for GVL" in parallel with the current thread working
+  // on sampling, and thus for the `current_monotonic_wall_time_ns` (which is recorded at the start of sampling)
+  // to be < the time at which we started Waiting for GVL.
+  //
+  // All together, this means that when `handle_gvl_waiting` creates an extra sample (see comments on that function for
+  // what the extra sample is), it's possible that there's no more wall-time to be assigned.
+  // Thus, in this case, we don't want to produce a sample representing Waiting for GVL with a wall-time of 0, and
+  // thus we skip creating such a sample.
+  if (is_gvl_waiting_state && wall_time_elapsed_ns == 0) return;
+  // ...you may also wonder: is there any other situation where it makes sense to produce a sample with
+  // wall_time_elapsed_ns == 0? I believe that yes, because the sample still includes a timestamp and a stack, but we
+  // may revisit/change our minds on this in the future.
+
   trigger_sample_for_thread(
     state,
     thread_being_sampled,
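A worked example (not part of the diff, with made-up timestamps) of the race the comment above describes: `handle_gvl_waiting` can advance the per-thread wall-time cursor past the timestamp captured at the start of sampling, leaving zero wall-time for the caller's own "Waiting for GVL" sample, which is why it gets skipped. This assumes the elapsed-time helper clamps negative intervals to zero, which is what the skip condition implies.

```c
// Sketch only: concrete numbers for the zero-wall-time case described above.
#include <stdio.h>

int main(void) {
  long wall_time_at_previous_sample_ns = 900000;  // last regular sample
  long sampling_started_at_ns          = 1000000; // current_monotonic_wall_time_ns, read at the start of sampling
  long gvl_waiting_started_at_ns       = 1000500; // thread started Waiting for GVL concurrently, slightly later

  // handle_gvl_waiting pushes the "extra" sample and advances the previous-sample cursor
  // up to the start of the wait...
  wall_time_at_previous_sample_ns = gvl_waiting_started_at_ns;

  // ...so the caller's own sample would cover a negative interval, clamped here to zero:
  long wall_time_elapsed_ns = sampling_started_at_ns - wall_time_at_previous_sample_ns;
  if (wall_time_elapsed_ns < 0) wall_time_elapsed_ns = 0;

  printf("wall_time_elapsed_ns = %ld (sample skipped when in the gvl-waiting state)\n", wall_time_elapsed_ns);
  return 0;
}
```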
@@ -537,7 +660,8 @@ void update_metrics_and_sample(
     (sample_values) {.cpu_time_ns = cpu_time_elapsed_ns, .cpu_or_wall_samples = 1, .wall_time_ns = wall_time_elapsed_ns},
     current_monotonic_wall_time_ns,
     NULL,
-    NULL
+    NULL,
+    is_gvl_waiting_state
   );
 }
 
@@ -583,6 +707,7 @@ void thread_context_collector_on_gc_start(VALUE self_instance) {
 //
 // Assumption 1: This function is called in a thread that is holding the Global VM Lock. Caller is responsible for enforcing this.
 // Assumption 2: This function is called from the main Ractor (if Ruby has support for Ractors).
+__attribute__((warn_unused_result))
 bool thread_context_collector_on_gc_finish(VALUE self_instance) {
   struct thread_context_collector_state *state;
   if (!rb_typeddata_is_kind_of(self_instance, &thread_context_collector_typed_data)) return false;
@@ -718,7 +843,8 @@ static void trigger_sample_for_thread(
   long current_monotonic_wall_time_ns,
   // These two labels are only used for allocation profiling; @ivoanjo: may want to refactor this at some point?
   ddog_CharSlice *ruby_vm_type,
-  ddog_CharSlice *class_name
+  ddog_CharSlice *class_name,
+  bool is_gvl_waiting_state
 ) {
   int max_label_count =
     1 + // thread id
@@ -759,6 +885,11 @@ static void trigger_sample_for_thread(
   struct trace_identifiers trace_identifiers_result = {.valid = false, .trace_endpoint = Qnil};
   trace_identifiers_for(state, thread, &trace_identifiers_result);
 
+  if (!trace_identifiers_result.valid && state->otel_context_enabled != otel_context_enabled_false) {
+    // If we couldn't get something with ddtrace, let's see if we can get some trace identifiers from opentelemetry directly
+    otel_without_ddtrace_trace_identifiers_for(state, thread, &trace_identifiers_result);
+  }
+
   if (trace_identifiers_result.valid) {
     labels[label_pos++] = (ddog_prof_Label) {.key = DDOG_CHARSLICE_C("local root span id"), .num = trace_identifiers_result.local_root_span_id};
     labels[label_pos++] = (ddog_prof_Label) {.key = DDOG_CHARSLICE_C("span id"), .num = trace_identifiers_result.span_id};
@@ -837,7 +968,12 @@ static void trigger_sample_for_thread(
     sampling_buffer,
     state->recorder_instance,
     values,
-    (sample_labels) {
+    (sample_labels) {
+      .labels = slice_labels,
+      .state_label = state_label,
+      .end_timestamp_ns = end_timestamp_ns,
+      .is_gvl_waiting_state = is_gvl_waiting_state,
+    }
   );
 }
 
@@ -887,9 +1023,9 @@ static struct per_thread_context *get_context_for(VALUE thread, struct thread_co
 // to either run Ruby code during sampling (not great), or otherwise use some of the VM private APIs to detect this.
 //
 static bool is_logging_gem_monkey_patch(VALUE invoke_file_location) {
-
+  unsigned long logging_gem_path_len = strlen(LOGGING_GEM_PATH);
   char *invoke_file = StringValueCStr(invoke_file_location);
-
+  unsigned long invoke_file_len = strlen(invoke_file);
 
   if (invoke_file_len < logging_gem_path_len) return false;
 
@@ -937,6 +1073,20 @@ static void initialize_context(VALUE thread, struct per_thread_context *thread_c
   // These will only be used during a GC operation
   thread_context->gc_tracking.cpu_time_at_start_ns = INVALID_TIME;
   thread_context->gc_tracking.wall_time_at_start_ns = INVALID_TIME;
+
+  #ifndef NO_GVL_INSTRUMENTATION
+    // We use this special location to store data that can be accessed without any
+    // kind of synchronization (e.g. by threads without the GVL).
+    //
+    // We set this marker here for two purposes:
+    // * To make sure there's no stale data from a previous execution of the profiler.
+    // * To mark threads that are actually being profiled
+    //
+    // (Setting this is potentially a race, but what we want is to avoid _stale_ data, so
+    // if this gets set concurrently with context initialization, then such a value will belong
+    // to the current profiler instance, so that's OK)
+    gvl_profiling_state_thread_object_set(thread, GVL_WAITING_ENABLED_EMPTY);
+  #endif
 }
 
 static void free_context(struct per_thread_context* thread_context) {
@@ -960,6 +1110,7 @@ static VALUE _native_inspect(DDTRACE_UNUSED VALUE _self, VALUE collector_instanc
   rb_str_concat(result, rb_sprintf(" stats=%"PRIsVALUE, stats_as_ruby_hash(state)));
   rb_str_concat(result, rb_sprintf(" endpoint_collection_enabled=%"PRIsVALUE, state->endpoint_collection_enabled ? Qtrue : Qfalse));
   rb_str_concat(result, rb_sprintf(" timeline_enabled=%"PRIsVALUE, state->timeline_enabled ? Qtrue : Qfalse));
+  rb_str_concat(result, rb_sprintf(" otel_context_enabled=%d", state->otel_context_enabled));
   rb_str_concat(result, rb_sprintf(" allocation_type_enabled=%"PRIsVALUE, state->allocation_type_enabled ? Qtrue : Qfalse));
   rb_str_concat(result, rb_sprintf(
     " time_converter_state={.system_epoch_ns_reference=%ld, .delta_to_epoch_ns=%ld}",
@@ -969,6 +1120,7 @@ static VALUE _native_inspect(DDTRACE_UNUSED VALUE _self, VALUE collector_instanc
   rb_str_concat(result, rb_sprintf(" main_thread=%"PRIsVALUE, state->main_thread));
   rb_str_concat(result, rb_sprintf(" gc_tracking=%"PRIsVALUE, gc_tracking_as_ruby_hash(state)));
   rb_str_concat(result, rb_sprintf(" otel_current_span_key=%"PRIsVALUE, state->otel_current_span_key));
+  rb_str_concat(result, rb_sprintf(" global_waiting_for_gvl_threshold_ns=%u", global_waiting_for_gvl_threshold_ns));
 
   return result;
 }
@@ -996,6 +1148,10 @@ static int per_thread_context_as_ruby_hash(st_data_t key_thread, st_data_t value
 
     ID2SYM(rb_intern("gc_tracking.cpu_time_at_start_ns")), /* => */ LONG2NUM(thread_context->gc_tracking.cpu_time_at_start_ns),
     ID2SYM(rb_intern("gc_tracking.wall_time_at_start_ns")), /* => */ LONG2NUM(thread_context->gc_tracking.wall_time_at_start_ns),
+
+    #ifndef NO_GVL_INSTRUMENTATION
+      ID2SYM(rb_intern("gvl_waiting_at")), /* => */ LONG2NUM(gvl_profiling_state_thread_object_get(thread)),
+    #endif
   };
   for (long unsigned int i = 0; i < VALUE_COUNT(arguments); i += 2) rb_hash_aset(context_as_hash, arguments[i], arguments[i+1]);
 
@@ -1146,6 +1302,7 @@ static VALUE _native_gc_tracking(DDTRACE_UNUSED VALUE _self, VALUE collector_ins
 
 // Assumption 1: This function is called in a thread that is holding the Global VM Lock. Caller is responsible for enforcing this.
 static void trace_identifiers_for(struct thread_context_collector_state *state, VALUE thread, struct trace_identifiers *trace_identifiers_result) {
+  if (state->otel_context_enabled == otel_context_enabled_only) return;
   if (state->tracer_context_key == MISSING_TRACER_CONTEXT_KEY) return;
 
   VALUE current_context = rb_thread_local_aref(thread, state->tracer_context_key);
@@ -1200,7 +1357,7 @@ static bool should_collect_resource(VALUE root_span) {
   if (root_span_type == Qnil) return false;
   ENFORCE_TYPE(root_span_type, T_STRING);
 
-
+  long root_span_type_length = RSTRING_LEN(root_span_type);
   const char *root_span_type_value = StringValuePtr(root_span_type);
 
   bool is_web_request =
@@ -1223,6 +1380,9 @@ static VALUE _native_reset_after_fork(DDTRACE_UNUSED VALUE self, VALUE collector
   struct thread_context_collector_state *state;
   TypedData_Get_Struct(collector_instance, struct thread_context_collector_state, &thread_context_collector_typed_data, state);
 
+  // Release all context memory before clearing the existing context
+  st_foreach(state->hash_map_per_thread_context, hash_map_per_thread_context_free_values, 0 /* unused */);
+
   st_clear(state->hash_map_per_thread_context);
 
   state->stats = (struct stats) {}; // Resets all stats back to zero
@@ -1326,7 +1486,8 @@ void thread_context_collector_sample_allocation(VALUE self_instance, unsigned in
     (sample_values) {.alloc_samples = sample_weight, .alloc_samples_unscaled = 1, .heap_sample = true},
     INVALID_TIME, // For now we're not collecting timestamps for allocation events, as per profiling team internal discussions
     &ruby_vm_type,
-    optional_class_name
+    optional_class_name,
+    false
   );
 }
 
@@ -1372,25 +1533,29 @@ static ddog_CharSlice ruby_value_type_to_class_name(enum ruby_value_type type) {
   }
 }
 
+// Used to access OpenTelemetry::Trace.const_get(:CURRENT_SPAN_KEY). Will raise exceptions if it fails.
+static VALUE read_otel_current_span_key_const(DDTRACE_UNUSED VALUE _unused) {
+  VALUE opentelemetry_module = rb_const_get(rb_cObject, rb_intern("OpenTelemetry"));
+  ENFORCE_TYPE(opentelemetry_module, T_MODULE);
+  VALUE trace_module = rb_const_get(opentelemetry_module, rb_intern("Trace"));
+  ENFORCE_TYPE(trace_module, T_MODULE);
+  return rb_const_get(trace_module, rb_intern("CURRENT_SPAN_KEY"));
+}
+
 static VALUE get_otel_current_span_key(struct thread_context_collector_state *state) {
-  if (state->otel_current_span_key ==
-
-
-  VALUE
-  VALUE context_module = rb_const_get(api_module, rb_intern_const("Context"));
-  VALUE current_span_key = rb_const_get(context_module, rb_intern_const("CURRENT_SPAN_KEY"));
-
-  if (current_span_key == Qnil) {
-    rb_raise(rb_eRuntimeError, "Unexpected: Missing Datadog::OpenTelemetry::API::Context::CURRENT_SPAN_KEY");
-  }
+  if (state->otel_current_span_key == Qtrue) { // Qtrue means we haven't tried to extract it yet
+    // If this fails, we want to fail gracefully, rather than raise an exception (e.g. if the opentelemetry gem
+    // gets refactored, we should not fall on our face)
+    VALUE span_key = rb_protect(read_otel_current_span_key_const, Qnil, NULL);
 
-
+    // Note that this gets set to Qnil if we failed to extract the correct value, and thus we won't try to extract it again
+    state->otel_current_span_key = span_key;
   }
 
   return state->otel_current_span_key;
 }
 
-// This method gets used when ddtrace is being used indirectly via the
+// This method gets used when ddtrace is being used indirectly via the opentelemetry APIs. Information gets stored slightly
 // differently, and this codepath handles it.
 static void ddtrace_otel_trace_identifiers_for(
   struct thread_context_collector_state *state,
@@ -1410,6 +1575,7 @@ static void ddtrace_otel_trace_identifiers_for(
   if (resolved_numeric_span_id == Qnil) return;
 
   VALUE otel_current_span_key = get_otel_current_span_key(state);
+  if (otel_current_span_key == Qnil) return;
   VALUE current_trace = *active_trace;
 
   // ddtrace uses a different structure when spans are created from otel, where each otel span will have a unique ddtrace
@@ -1462,3 +1628,388 @@ static VALUE _native_sample_skipped_allocation_samples(DDTRACE_UNUSED VALUE self
   thread_context_collector_sample_skipped_allocation_samples(collector_instance, NUM2UINT(skipped_samples));
   return Qtrue;
 }
+
+// This method differs from trace_identifiers_for/ddtrace_otel_trace_identifiers_for to support the situation where
+// the opentelemetry ruby library is being used for tracing AND the ddtrace tracing bits are not involved at all.
+//
+// Thus, in this case, we're directly reading from the opentelemetry stuff, which is different to how ddtrace tracing
+// does it.
+//
+// This is somewhat brittle: we're coupling on internal details of the opentelemetry gem to get what we need. In the
+// future maybe the otel ruby folks would be open to having a nice public way of getting this data that suits the
+// usecase of profilers.
+// Until then, the strategy below is to be extremely defensive, and if anything is out of place, we immediately return
+// and give up on getting trace data from opentelemetry. (Thus, worst case would be -- you upgrade opentelemetry and
+// profiling features relying on reading this data stop working, but you'll still get profiles and the app will be
+// otherwise undisturbed).
+//
+// Specifically, the way this works is:
+// 1. The latest entry in the opentelemetry context storage represents the current span (if any). We take the span id
+//    and trace id from this span.
+// 2. To find the local root span id, we walk the context storage backwards from the current span, and find the earliest
+//    entry in the context storage that has the same trace id as the current span; we use the found span as the local
+//    root span id.
+//    This matches the semantics of how ddtrace tracing creates a TraceOperation and assigns a local root span to it.
+static void otel_without_ddtrace_trace_identifiers_for(
+  struct thread_context_collector_state *state,
+  VALUE thread,
+  struct trace_identifiers *trace_identifiers_result
+) {
+  VALUE context_storage = rb_thread_local_aref(thread, otel_context_storage_id /* __opentelemetry_context_storage__ */);
+
+  // If it exists, context_storage is expected to be an Array[OpenTelemetry::Context]
+  if (context_storage == Qnil || !RB_TYPE_P(context_storage, T_ARRAY)) return;
+
+  VALUE otel_current_span_key = get_otel_current_span_key(state);
+  if (otel_current_span_key == Qnil) return;
+
+  int active_context_index = RARRAY_LEN(context_storage) - 1;
+  if (active_context_index < 0) return;
+
+  struct otel_span active_span = otel_span_from(rb_ary_entry(context_storage, active_context_index), otel_current_span_key);
+  if (active_span.span == Qnil) return;
+
+  struct otel_span local_root_span = active_span;
+
+  // Now find the oldest span starting from the active span that still has the same trace id as the active span
+  for (int i = active_context_index - 1; i >= 0; i--) {
+    struct otel_span checking_span = otel_span_from(rb_ary_entry(context_storage, i), otel_current_span_key);
+    if (checking_span.span == Qnil) return;
+
+    if (rb_str_equal(active_span.trace_id, checking_span.trace_id) == Qfalse) break;
+
+    local_root_span = checking_span;
+  }
+
+  // Convert the span ids into uint64_t to match what the Datadog tracer does
+  trace_identifiers_result->span_id = otel_span_id_to_uint(active_span.span_id);
+  trace_identifiers_result->local_root_span_id = otel_span_id_to_uint(local_root_span.span_id);
+
+  if (trace_identifiers_result->span_id == 0 || trace_identifiers_result->local_root_span_id == 0) return;
+
+  trace_identifiers_result->valid = true;
+
+  if (!state->endpoint_collection_enabled) return;
+
+  VALUE root_span_type = rb_ivar_get(local_root_span.span, at_kind_id /* @kind */);
+  // We filter out spans that don't have `kind: :server`
+  if (root_span_type == Qnil || !RB_TYPE_P(root_span_type, T_SYMBOL) || SYM2ID(root_span_type) != server_id) return;
+
+  VALUE trace_resource = rb_ivar_get(local_root_span.span, at_name_id /* @name */);
+  if (!RB_TYPE_P(trace_resource, T_STRING)) return;
+
+  trace_identifiers_result->trace_endpoint = trace_resource;
+}
+
+static struct otel_span otel_span_from(VALUE otel_context, VALUE otel_current_span_key) {
+  struct otel_span failed = {.span = Qnil, .span_id = Qnil, .trace_id = Qnil};
+
+  if (otel_context == Qnil) return failed;
+
+  VALUE context_entries = rb_ivar_get(otel_context, at_entries_id /* @entries */);
+  if (context_entries == Qnil || !RB_TYPE_P(context_entries, T_HASH)) return failed;
+
+  // If it exists, context_entries is expected to be a Hash[OpenTelemetry::Context::Key, OpenTelemetry::Trace::Span]
+  VALUE span = rb_hash_lookup(context_entries, otel_current_span_key);
+  if (span == Qnil) return failed;
+
+  // If it exists, span_context is expected to be a OpenTelemetry::Trace::SpanContext (don't confuse it with OpenTelemetry::Context)
+  VALUE span_context = rb_ivar_get(span, at_context_id /* @context */);
+  if (span_context == Qnil) return failed;
+
+  VALUE span_id = rb_ivar_get(span_context, at_span_id_id /* @span_id */);
+  VALUE trace_id = rb_ivar_get(span_context, at_trace_id_id /* @trace_id */);
+  if (span_id == Qnil || trace_id == Qnil || !RB_TYPE_P(span_id, T_STRING) || !RB_TYPE_P(trace_id, T_STRING)) return failed;
+
+  return (struct otel_span) {.span = span, .span_id = span_id, .trace_id = trace_id};
+}
+
+// Otel span ids are represented as a big-endian 8-byte string
+static uint64_t otel_span_id_to_uint(VALUE otel_span_id) {
+  if (!RB_TYPE_P(otel_span_id, T_STRING) || RSTRING_LEN(otel_span_id) != 8) { return 0; }
+
+  unsigned char *span_bytes = (unsigned char*) StringValuePtr(otel_span_id);
+
+  return \
+    ((uint64_t)span_bytes[0] << 56) |
+    ((uint64_t)span_bytes[1] << 48) |
+    ((uint64_t)span_bytes[2] << 40) |
+    ((uint64_t)span_bytes[3] << 32) |
+    ((uint64_t)span_bytes[4] << 24) |
+    ((uint64_t)span_bytes[5] << 16) |
+    ((uint64_t)span_bytes[6] << 8) |
+    ((uint64_t)span_bytes[7]);
+}
+
+#ifndef NO_GVL_INSTRUMENTATION
+  // This function can get called from outside the GVL and even on non-main Ractors
+  void thread_context_collector_on_gvl_waiting(gvl_profiling_thread thread) {
+    // Because this function gets called from a thread that is NOT holding the GVL, we avoid touching the
+    // per-thread context directly.
+    //
+    // Instead, we ask Ruby to hold the data we need in Ruby's own special per-thread context area
+    // that's thread-safe and built for this kind of use
+    //
+    // Also, this function can get called on the non-main Ractor. We deal with this by checking if the value in the context
+    // is non-zero, since only `initialize_context` ever sets the value from 0 to non-zero for threads it sees.
+    intptr_t thread_being_profiled = gvl_profiling_state_get(thread);
+    if (!thread_being_profiled) return;
+
+    long current_monotonic_wall_time_ns = monotonic_wall_time_now_ns(DO_NOT_RAISE_ON_FAILURE);
+    if (current_monotonic_wall_time_ns <= 0 || current_monotonic_wall_time_ns > GVL_WAITING_ENABLED_EMPTY) return;
+
+    gvl_profiling_state_set(thread, current_monotonic_wall_time_ns);
+  }
+
+  // This function can get called from outside the GVL and even on non-main Ractors
+  __attribute__((warn_unused_result))
+  bool thread_context_collector_on_gvl_running_with_threshold(gvl_profiling_thread thread, uint32_t waiting_for_gvl_threshold_ns) {
+    intptr_t gvl_waiting_at = gvl_profiling_state_get(thread);
+
+    // Thread was not being profiled / not waiting on gvl
+    if (gvl_waiting_at == 0 || gvl_waiting_at == GVL_WAITING_ENABLED_EMPTY) return false;
+
+    // @ivoanjo: I'm not sure if this can happen -- It means we should've sampled already but haven't gotten the chance yet?
+    if (gvl_waiting_at < 0) return true;
+
+    long waiting_for_gvl_duration_ns = monotonic_wall_time_now_ns(DO_NOT_RAISE_ON_FAILURE) - gvl_waiting_at;
+
+    bool should_sample = waiting_for_gvl_duration_ns >= waiting_for_gvl_threshold_ns;
+
+    if (should_sample) {
+      // We flip the gvl_waiting_at to negative to mark that the thread is now running and no longer waiting
+      intptr_t gvl_waiting_at_is_now_running = -gvl_waiting_at;
+
+      gvl_profiling_state_set(thread, gvl_waiting_at_is_now_running);
+    } else {
+      // We decided not to sample. Let's mark the thread back to the initial "enabled but empty" state
+      gvl_profiling_state_set(thread, GVL_WAITING_ENABLED_EMPTY);
+    }
+
+    return should_sample;
+  }
+
+  __attribute__((warn_unused_result))
+  bool thread_context_collector_on_gvl_running(gvl_profiling_thread thread) {
+    return thread_context_collector_on_gvl_running_with_threshold(thread, global_waiting_for_gvl_threshold_ns);
+  }
+
+  // Why does this method need to exist?
+  //
+  // You may be surprised to see that if we never call this function (from cpu_and_wall_time_worker), Waiting for GVL
+  // samples will still show up.
+  // This is because regular cpu/wall-time samples also use `update_metrics_and_sample` which will do the right thing
+  // and push "Waiting for GVL" samples as needed.
+  //
+  // The reason this method needs to exist and be called very shortly after thread_context_collector_on_gvl_running
+  // returning true is to ensure accuracy of both the timing and stack for the Waiting for GVL sample.
+  //
+  // Timing:
+  // Because we currently only record the timestamp when the Waiting for GVL started and not when the Waiting for GVL ended,
+  // we rely on pushing a sample as soon as possible when the Waiting for GVL ends so that the timestamp of the sample
+  // actually matches when we stopped waiting.
+  //
+  // Stack:
+  // If the thread starts working without the end of the Waiting for GVL sample, then by the time the thread is sampled
+  // via the regular cpu/wall-time samples mechanism, the stack can be be inaccurate (e.g. does not correctly pinpoint
+  // where the waiting happened).
+  //
+  // Arguably, the last sample after Waiting for GVL ended (when gvl_waiting_at < 0) should always come from this method
+  // and not a regular cpu/wall-time sample BUT since all of these things are happening in parallel/concurrently I suspect
+  // it's possible for a regular sample to kick in just before this one.
+  //
+  // ---
+  //
+  // NOTE: In normal use, current_thread is expected to be == rb_thread_current(); the `current_thread` parameter only
+  // exists to enable testing.
+  VALUE thread_context_collector_sample_after_gvl_running_with_thread(VALUE self_instance, VALUE current_thread) {
+    struct thread_context_collector_state *state;
+    TypedData_Get_Struct(self_instance, struct thread_context_collector_state, &thread_context_collector_typed_data, state);
+
+    if (!state->timeline_enabled) rb_raise(rb_eRuntimeError, "GVL profiling requires timeline to be enabled");
+
+    intptr_t gvl_waiting_at = gvl_profiling_state_thread_object_get(current_thread);
+
+    if (gvl_waiting_at >= 0) {
+      // @ivoanjo: I'm not sure if this can ever happen. This means that we're not on the same thread
+      // that ran `thread_context_collector_on_gvl_running` and made the decision to sample OR a regular sample was
+      // triggered ahead of us.
+      // We do nothing in this case.
+      return Qfalse;
+    }
+
+    struct per_thread_context *thread_context = get_or_create_context_for(current_thread, state);
+
+    // We don't actually account for cpu-time during Waiting for GVL. BUT, we may chose to push an
+    // extra sample to represent the period prior to Waiting for GVL. To support that, we retrieve the current
+    // cpu-time of the thread and let `update_metrics_and_sample` decide what to do with it.
+    long cpu_time_for_thread = cpu_time_now_ns(thread_context);
+
+    // TODO: Should we update the dynamic sampling rate overhead tracking with this sample as well?
+
+    update_metrics_and_sample(
+      state,
+      /* thread_being_sampled: */ current_thread,
+      /* stack_from_thread: */ current_thread,
+      thread_context,
+      thread_context->sampling_buffer,
+      cpu_time_for_thread,
+      monotonic_wall_time_now_ns(RAISE_ON_FAILURE)
+    );
+
+    return Qtrue; // To allow this to be called from rb_rescue2
+  }
+
+  VALUE thread_context_collector_sample_after_gvl_running(VALUE self_instance) {
+    return thread_context_collector_sample_after_gvl_running_with_thread(self_instance, rb_thread_current());
+  }
+
+  // This method is intended to be called from update_metrics_and_sample. It exists to handle extra sampling steps we
+  // need to take when sampling cpu/wall-time for a thread that's in the "Waiting for GVL" state.
+  __attribute__((warn_unused_result))
+  static bool handle_gvl_waiting(
+    struct thread_context_collector_state *state,
+    VALUE thread_being_sampled,
+    VALUE stack_from_thread,
+    struct per_thread_context *thread_context,
+    sampling_buffer* sampling_buffer,
+    long current_cpu_time_ns
+  ) {
+    intptr_t gvl_waiting_at = gvl_profiling_state_thread_object_get(thread_being_sampled);
+
+    bool is_gvl_waiting_state = gvl_waiting_at != 0 && gvl_waiting_at != GVL_WAITING_ENABLED_EMPTY;
+
+    if (!is_gvl_waiting_state) return false;
+
+    // We can be in one of 2 situations here:
+    //
+    // 1. The current sample is the first one after we entered the "Waiting for GVL" state
+    //    (wall_time_at_previous_sample_ns < abs(gvl_waiting_at))
+    //
+    //                  time ─────►
+    //     ...──────────────┬───────────────────...
+    //      Other state     │ Waiting for GVL
+    //     ...──────────────┴───────────────────...
+    //                      ▲                   ▲
+    //                      └─ Previous sample  └─ Regular sample (caller)
+    //
+    // In this case, we'll want to push two samples: a) one for the current time (handled by the caller), b) an extra sample
+    // to represent the remaining cpu/wall time before the "Waiting for GVL" started:
+    //
+    //                  time ─────►
+    //     ...──────────────┬───────────────────...
+    //      Other state     │ Waiting for GVL
+    //     ...──────────────┴───────────────────...
+    //          ▲           ▲                   ▲
+    //          └─ Prev...  └─ Extra sample     └─ Regular sample (caller)
+    //
+    // 2. The current sample is the n-th one after we entered the "Waiting for GVL" state
+    //    (wall_time_at_previous_sample_ns > abs(gvl_waiting_at))
+    //
+    //                  time ─────►
+    //     ...──────────────┬───────────────────────────────────────────────...
+    //      Other state     │ Waiting for GVL
+    //     ...──────────────┴───────────────────────────────────────────────...
+    //                      ▲                   ▲                           ▲
+    //                      └─ Previous sample  └─ Previous sample          └─ Regular sample (caller)
+    //
+    // In this case, we just report back to the caller that the thread is in the "Waiting for GVL" state.
+    //
+    // ---
+    //
+    // Overall, gvl_waiting_at will be > 0 if still in the "Waiting for GVL" state and < 0 if we actually reached the end of
+    // the wait.
+    //
+    // It doesn't really matter if the thread is still waiting or just reached the end of the wait: each sample represents
+    // a snapshot at time ending now, so if the state finished, it just means the next sample will be a regular one.
+
+    if (gvl_waiting_at < 0) {
+      // Negative means the waiting for GVL just ended, so we clear the state, so next samples no longer represent waiting
+      gvl_profiling_state_thread_object_set(thread_being_sampled, GVL_WAITING_ENABLED_EMPTY);
+    }
+
+    long gvl_waiting_started_wall_time_ns = labs(gvl_waiting_at);
+
+    if (thread_context->wall_time_at_previous_sample_ns < gvl_waiting_started_wall_time_ns) { // situation 1 above
+      long cpu_time_elapsed_ns = update_time_since_previous_sample(
+        &thread_context->cpu_time_at_previous_sample_ns,
+        current_cpu_time_ns,
+        thread_context->gc_tracking.cpu_time_at_start_ns,
+        IS_NOT_WALL_TIME
+      );
+
+      long duration_until_start_of_gvl_waiting_ns = update_time_since_previous_sample(
+        &thread_context->wall_time_at_previous_sample_ns,
+        gvl_waiting_started_wall_time_ns,
+        INVALID_TIME,
+        IS_WALL_TIME
+      );
+
+      // Push extra sample
+      trigger_sample_for_thread(
+        state,
+        thread_being_sampled,
+        stack_from_thread,
+        thread_context,
+        sampling_buffer,
+        (sample_values) {.cpu_time_ns = cpu_time_elapsed_ns, .cpu_or_wall_samples = 1, .wall_time_ns = duration_until_start_of_gvl_waiting_ns},
+        gvl_waiting_started_wall_time_ns,
+        NULL,
+        NULL,
+        false // This is the extra sample before the wait begun; only the next sample will be in the gvl waiting state
+      );
+    }
+
+    return true;
+  }
+
+  static VALUE _native_on_gvl_waiting(DDTRACE_UNUSED VALUE self, VALUE thread) {
+    ENFORCE_THREAD(thread);
+
+    thread_context_collector_on_gvl_waiting(thread_from_thread_object(thread));
+    return Qnil;
+  }
+
+  static VALUE _native_gvl_waiting_at_for(DDTRACE_UNUSED VALUE self, VALUE thread) {
+    ENFORCE_THREAD(thread);
+
+    intptr_t gvl_waiting_at = gvl_profiling_state_thread_object_get(thread);
+    return LONG2NUM(gvl_waiting_at);
+  }
+
+  static VALUE _native_on_gvl_running(DDTRACE_UNUSED VALUE self, VALUE thread) {
+    ENFORCE_THREAD(thread);
+
+    return thread_context_collector_on_gvl_running(thread_from_thread_object(thread)) ? Qtrue : Qfalse;
+  }
+
+  static VALUE _native_sample_after_gvl_running(DDTRACE_UNUSED VALUE self, VALUE collector_instance, VALUE thread) {
+    ENFORCE_THREAD(thread);
+
+    return thread_context_collector_sample_after_gvl_running_with_thread(collector_instance, thread);
+  }
+
+  static VALUE _native_apply_delta_to_cpu_time_at_previous_sample_ns(DDTRACE_UNUSED VALUE self, VALUE collector_instance, VALUE thread, VALUE delta_ns) {
+    ENFORCE_THREAD(thread);
+
+    struct thread_context_collector_state *state;
+    TypedData_Get_Struct(collector_instance, struct thread_context_collector_state, &thread_context_collector_typed_data, state);
+
+    struct per_thread_context *thread_context = get_context_for(thread, state);
+    if (thread_context == NULL) rb_raise(rb_eArgError, "Unexpected: This method cannot be used unless the per-thread context for the thread already exists");
+
+    thread_context->cpu_time_at_previous_sample_ns += NUM2LONG(delta_ns);
+
+    return Qtrue;
+  }
+
+#else
+  static bool handle_gvl_waiting(
+    DDTRACE_UNUSED struct thread_context_collector_state *state,
+    DDTRACE_UNUSED VALUE thread_being_sampled,
+    DDTRACE_UNUSED VALUE stack_from_thread,
+    DDTRACE_UNUSED struct per_thread_context *thread_context,
+    DDTRACE_UNUSED sampling_buffer* sampling_buffer,
+    DDTRACE_UNUSED long current_cpu_time_ns
+  ) { return false; }
+#endif // NO_GVL_INSTRUMENTATION