ddtrace 1.3.0 → 1.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +40 -2
- data/README.md +1 -1
- data/ext/ddtrace_profiling_loader/ddtrace_profiling_loader.c +10 -1
- data/ext/ddtrace_profiling_native_extension/collectors_cpu_and_wall_time.c +5 -4
- data/ext/ddtrace_profiling_native_extension/collectors_cpu_and_wall_time.h +1 -1
- data/ext/ddtrace_profiling_native_extension/collectors_cpu_and_wall_time_worker.c +391 -0
- data/ext/ddtrace_profiling_native_extension/extconf.rb +2 -0
- data/ext/ddtrace_profiling_native_extension/private_vm_api_access.c +9 -0
- data/ext/ddtrace_profiling_native_extension/profiling.c +2 -0
- data/ext/ddtrace_profiling_native_extension/stack_recorder.c +2 -1
- data/ext/ddtrace_profiling_native_extension/stack_recorder.h +8 -7
- data/lib/datadog/ci/contrib/cucumber/integration.rb +1 -1
- data/lib/datadog/ci/contrib/rspec/integration.rb +1 -1
- data/lib/datadog/core/configuration/base.rb +9 -0
- data/lib/datadog/core/configuration/components.rb +26 -6
- data/lib/datadog/core/configuration/settings.rb +25 -0
- data/lib/datadog/core/configuration.rb +4 -1
- data/lib/datadog/core/telemetry/client.rb +79 -0
- data/lib/datadog/core/telemetry/collector.rb +234 -0
- data/lib/datadog/core/telemetry/emitter.rb +48 -0
- data/lib/datadog/core/telemetry/event.rb +71 -0
- data/lib/datadog/core/telemetry/ext.rb +11 -0
- data/lib/datadog/core/telemetry/heartbeat.rb +37 -0
- data/lib/datadog/core/telemetry/http/adapters/net.rb +113 -0
- data/lib/datadog/core/telemetry/http/env.rb +20 -0
- data/lib/datadog/core/telemetry/http/ext.rb +20 -0
- data/lib/datadog/core/telemetry/http/response.rb +68 -0
- data/lib/datadog/core/telemetry/http/transport.rb +53 -0
- data/lib/datadog/core/telemetry/v1/app_event.rb +52 -0
- data/lib/datadog/core/telemetry/v1/application.rb +86 -0
- data/lib/datadog/core/telemetry/v1/configuration.rb +25 -0
- data/lib/datadog/core/telemetry/v1/dependency.rb +36 -0
- data/lib/datadog/core/telemetry/v1/host.rb +51 -0
- data/lib/datadog/core/telemetry/v1/integration.rb +58 -0
- data/lib/datadog/core/telemetry/v1/product.rb +28 -0
- data/lib/datadog/core/telemetry/v1/telemetry_request.rb +100 -0
- data/lib/datadog/core/utils/sequence.rb +5 -0
- data/lib/datadog/profiling/collectors/cpu_and_wall_time_worker.rb +74 -0
- data/lib/datadog/profiling/stack_recorder.rb +1 -1
- data/lib/datadog/profiling.rb +1 -0
- data/lib/datadog/tracing/contrib/extensions.rb +2 -0
- data/lib/datadog/tracing/contrib/grpc/datadog_interceptor/client.rb +9 -0
- data/lib/datadog/tracing/contrib/grpc/ext.rb +1 -0
- data/lib/datadog/tracing/contrib/patcher.rb +11 -0
- data/lib/datadog/tracing/contrib/rack/patcher.rb +8 -0
- data/lib/datadog/tracing/trace_operation.rb +1 -1
- data/lib/ddtrace/auto_instrument.rb +7 -0
- data/lib/ddtrace/transport/ext.rb +0 -1
- data/lib/ddtrace/transport/http/adapters/net.rb +1 -0
- data/lib/ddtrace/version.rb +2 -2
- metadata +26 -5
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: '01987691359b14541248048bb2f720031b4f0f0ca0be8fade0f6d0daf7aef8cc'
+  data.tar.gz: b2cce425177f3d619d0064d88ffb52073515b8d390e054117a160eac21e939d5
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 7232b588ee13d8a47a1fccbe39a8753a19a4e9c3b0b2dee2cb278a64b183500ec9859cfecffda8c128971bf88c93b148e6f915c2d7906139d1df050b173df42c
+  data.tar.gz: 2605640bffc586a0fd8bb38139d49fa6dad9f04bfe91a0e672cdd04a6c8a8a36896be5322a4b78ecab37175cd600bdc144109646712265b49bad4e103cc1ab5e
data/CHANGELOG.md
CHANGED
@@ -2,6 +2,32 @@
 
 ## [Unreleased]
 
+## [1.4.1] - 2022-09-15
+
+### Fixed
+
+* Missing distributed traces when trace is dropped by priority sampling ([#2101][], [#2279][])
+* Profiling support when Ruby is compiled without a shared library ([#2250][])
+
+## [1.4.0] - 2022-08-25
+
+Release notes: https://github.com/DataDog/dd-trace-rb/releases/tag/v1.4.0
+
+Git diff: https://github.com/DataDog/dd-trace-rb/compare/v1.3.0...v1.4.0
+
+### Added
+
+* gRPC: tag `grpc.client.deadline` ([#2200][])
+* Implement telemetry, disable by default ([#2153][])
+
+### Changed
+
+* Bump `libdatadog` dependency version ([#2229][])
+
+### Fixed
+
+* Fix CI instrumentation configuration ([#2219][])
+
 ## [1.3.0] - 2022-08-04
 
 Release notes: https://github.com/DataDog/dd-trace-rb/releases/tag/v1.3.0
@@ -2071,7 +2097,11 @@ Release notes: https://github.com/DataDog/dd-trace-rb/releases/tag/v0.3.1
 
 Git diff: https://github.com/DataDog/dd-trace-rb/compare/v0.3.0...v0.3.1
 
-[Unreleased]: https://github.com/DataDog/dd-trace-rb/compare/v1.1
+[Unreleased]: https://github.com/DataDog/dd-trace-rb/compare/v1.4.1...master
+[1.4.1]: https://github.com/DataDog/dd-trace-rb/compare/v1.4.0...v1.4.1
+[1.4.0]: https://github.com/DataDog/dd-trace-rb/compare/v1.3.0...v1.4.0
+[1.3.0]: https://github.com/DataDog/dd-trace-rb/compare/v1.2.0...v1.3.0
+[1.2.0]: https://github.com/DataDog/dd-trace-rb/compare/v1.1.0...v1.2.0
 [1.1.0]: https://github.com/DataDog/dd-trace-rb/compare/v1.0.0...v1.1.0
 [1.0.0]: https://github.com/DataDog/dd-trace-rb/compare/v1.0.0.beta2...v1.0.0
 [1.0.0.beta2]: https://github.com/DataDog/dd-trace-rb/compare/v1.0.0.beta1...v1.0.0.beta2
@@ -2931,6 +2961,7 @@ Git diff: https://github.com/DataDog/dd-trace-rb/compare/v0.3.0...v0.3.1
 [#2082]: https://github.com/DataDog/dd-trace-rb/issues/2082
 [#2096]: https://github.com/DataDog/dd-trace-rb/issues/2096
 [#2097]: https://github.com/DataDog/dd-trace-rb/issues/2097
+[#2101]: https://github.com/DataDog/dd-trace-rb/issues/2101
 [#2110]: https://github.com/DataDog/dd-trace-rb/issues/2110
 [#2113]: https://github.com/DataDog/dd-trace-rb/issues/2113
 [#2118]: https://github.com/DataDog/dd-trace-rb/issues/2118
@@ -2939,13 +2970,20 @@ Git diff: https://github.com/DataDog/dd-trace-rb/compare/v0.3.0...v0.3.1
 [#2138]: https://github.com/DataDog/dd-trace-rb/issues/2138
 [#2140]: https://github.com/DataDog/dd-trace-rb/issues/2140
 [#2150]: https://github.com/DataDog/dd-trace-rb/issues/2150
+[#2153]: https://github.com/DataDog/dd-trace-rb/issues/2153
 [#2158]: https://github.com/DataDog/dd-trace-rb/issues/2158
 [#2162]: https://github.com/DataDog/dd-trace-rb/issues/2162
 [#2163]: https://github.com/DataDog/dd-trace-rb/issues/2163
+[#2170]: https://github.com/DataDog/dd-trace-rb/issues/2170
 [#2173]: https://github.com/DataDog/dd-trace-rb/issues/2173
 [#2174]: https://github.com/DataDog/dd-trace-rb/issues/2174
 [#2180]: https://github.com/DataDog/dd-trace-rb/issues/2180
+[#2200]: https://github.com/DataDog/dd-trace-rb/issues/2200
 [#2201]: https://github.com/DataDog/dd-trace-rb/issues/2201
+[#2219]: https://github.com/DataDog/dd-trace-rb/issues/2219
+[#2229]: https://github.com/DataDog/dd-trace-rb/issues/2229
+[#2250]: https://github.com/DataDog/dd-trace-rb/issues/2250
+[#2279]: https://github.com/DataDog/dd-trace-rb/issues/2279
 [@AdrianLC]: https://github.com/AdrianLC
 [@Azure7111]: https://github.com/Azure7111
 [@BabyGroot]: https://github.com/BabyGroot
@@ -3087,4 +3125,4 @@ Git diff: https://github.com/DataDog/dd-trace-rb/compare/v0.3.0...v0.3.1
 [@walterking]: https://github.com/walterking
 [@y-yagi]: https://github.com/y-yagi
 [@yukimurasawa]: https://github.com/yukimurasawa
-[@zachmccormick]: https://github.com/zachmccormick
+[@zachmccormick]: https://github.com/zachmccormick
data/README.md
CHANGED
@@ -1,4 +1,4 @@
-
+**We've recently released the 1.x version series. If you're upgrading from a 0.x version, check out our [upgrade guide](https://github.com/DataDog/dd-trace-rb/blob/master/docs/UpgradeGuide.md#from-0x-to-10).**
 
 # Datadog Trace Client
 
data/ext/ddtrace_profiling_loader/ddtrace_profiling_loader.c
CHANGED
@@ -85,7 +85,16 @@ static bool failed_to_load(void *handle, VALUE *failure_details) {
 static bool incompatible_library(void *handle, VALUE *failure_details) {
   // The library being loaded may be linked to a different libruby than the current executing Ruby.
   // We check if this is the case by checking if a well-known symbol resolves to a common address.
-
+
+  void *xmalloc_from_library = dlsym(handle, "ruby_xmalloc");
+
+  if (xmalloc_from_library == NULL) {
+    // This happens when ruby is built without a `libruby.so` by using `--disable-shared` at compilation time.
+    // In this situation, no conflict between libruby version is possible.
+    return false;
+  }
+
+  if (xmalloc_from_library != &ruby_xmalloc) {
     *failure_details = rb_str_new_cstr("library was compiled and linked to a different Ruby version");
     unload_failed_library(handle);
     return true;
data/ext/ddtrace_profiling_native_extension/collectors_cpu_and_wall_time.c
CHANGED
@@ -153,8 +153,6 @@ static VALUE _native_new(VALUE klass) {
 }
 
 static VALUE _native_initialize(DDTRACE_UNUSED VALUE _self, VALUE collector_instance, VALUE recorder_instance, VALUE max_frames) {
-  enforce_recorder_instance(recorder_instance);
-
   struct cpu_and_wall_time_collector_state *state;
   TypedData_Get_Struct(collector_instance, struct cpu_and_wall_time_collector_state, &cpu_and_wall_time_collector_typed_data, state);
 
@@ -164,7 +162,7 @@ static VALUE _native_initialize(DDTRACE_UNUSED VALUE _self, VALUE collector_inst
   // Update this when modifying state struct
   state->sampling_buffer = sampling_buffer_new(max_frames_requested);
   // hash_map_per_thread_context is already initialized, nothing to do here
-  state->recorder_instance = recorder_instance;
+  state->recorder_instance = enforce_recorder_instance(recorder_instance);
 
   return Qtrue;
 }
@@ -180,6 +178,8 @@ static VALUE _native_sample(DDTRACE_UNUSED VALUE _self, VALUE collector_instance
 //
 // Assumption 1: This function is called in a thread that is holding the Global VM Lock. Caller is responsible for enforcing this.
 // Assumption 2: This function is allowed to raise exceptions. Caller is responsible for handling them, if needed.
+// Assumption 3: This function IS NOT called from a signal handler. This function is not async-signal-safe.
+// Assumption 4: This function IS NOT called in a reentrant way.
 VALUE cpu_and_wall_time_collector_sample(VALUE self_instance) {
   struct cpu_and_wall_time_collector_state *state;
   TypedData_Get_Struct(self_instance, struct cpu_and_wall_time_collector_state, &cpu_and_wall_time_collector_typed_data, state);
@@ -384,6 +384,7 @@ static long thread_id_for(VALUE thread) {
   return FIXNUM_P(object_id) ? FIX2LONG(object_id) : -1;
 }
 
-
+VALUE enforce_cpu_and_wall_time_collector_instance(VALUE object) {
   Check_TypedStruct(object, &cpu_and_wall_time_collector_typed_data);
+  return object;
 }
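
The `enforce_*` helpers in the hunks above were changed from pure checks into functions that return their argument, which is what lets `state->recorder_instance = enforce_recorder_instance(recorder_instance);` validate and assign in one step. A minimal sketch of that validate-and-return pattern outside of Ruby's C API (all names below are made up for illustration):

```c
// Hypothetical illustration of the "check and return" pattern: having the validation
// helper return its argument means a field can never be set without passing the check.
#include <stdio.h>
#include <stdlib.h>

typedef struct { int kind; } object_t;

// Hypothetical checker: aborts if the object is not of the expected kind,
// otherwise returns the object unchanged (mirrors enforce_recorder_instance).
static object_t *enforce_recorder(object_t *object) {
  if (object == NULL || object->kind != 42) {
    fprintf(stderr, "not a recorder instance\n");
    exit(EXIT_FAILURE);
  }
  return object;
}

struct collector_state { object_t *recorder_instance; };

int main(void) {
  object_t recorder = {.kind = 42};
  struct collector_state state;

  // Validation happens at the exact point of assignment, as in the diff above.
  state.recorder_instance = enforce_recorder(&recorder);

  printf("recorder accepted: kind=%d\n", state.recorder_instance->kind);
  return 0;
}
```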
data/ext/ddtrace_profiling_native_extension/collectors_cpu_and_wall_time_worker.c
ADDED
@@ -0,0 +1,391 @@
+#include <ruby.h>
+#include <ruby/thread.h>
+#include <ruby/thread_native.h>
+#include <ruby/debug.h>
+#include <stdbool.h>
+#include <signal.h>
+#include "helpers.h"
+#include "ruby_helpers.h"
+#include "collectors_cpu_and_wall_time.h"
+#include "private_vm_api_access.h"
+
+// Used to trigger the periodic execution of Collectors::CpuAndWallTime, which implements all of the sampling logic
+// itself; this class only implements the "doing it periodically" part.
+//
+// This file implements the native bits of the Datadog::Profiling::Collectors::CpuAndWallTimeWorker class
+
+// ---
+// Here be dragons: This component is quite fiddly and probably one of the more complex in the profiler as it deals with
+// multiple threads, signal handlers, global state, etc.
+//
+// ## Design notes for this class:
+//
+// ### Constraints
+//
+// Currently, sampling Ruby threads requires calling Ruby VM APIs that are only safe to call while holding on to the
+// global VM lock (and are not async-signal safe -- cannot be called from a signal handler).
+//
+// @ivoanjo: As a note, I don't think we should think of this constraint as set in stone. Since can reach into the Ruby
+// internals, we may be able to figure out a way of overcoming it. But it's definitely going to be hard so for now
+// we're considering it as a given.
+//
+// ### Flow for triggering samples
+//
+// The flow for triggering samples is as follows:
+//
+// 1. Inside the `run_sampling_trigger_loop` function (running in the `CpuAndWallTimeWorker` background thread),
+// a `SIGPROF` signal gets sent to the current process.
+//
+// 2. The `handle_sampling_signal` signal handler function gets called to handle the `SIGPROF` signal.
+//
+// Which thread the signal handler function gets called on by the operating system is quite important. We need to perform
+// an operation -- calling the `rb_postponed_job_register_one` API -- that can only be called from the thread that
+// is holding on to the global VM lock. So this is the thread we're "hoping" our signal lands on.
+//
+// The signal never lands on the `CpuAndWallTimeWorker` background thread because we explicitly block it off from that
+// thread in `block_sigprof_signal_handler_from_running_in_current_thread`.
+//
+// If the signal lands on a thread that is not holding onto the global VM lock, we can't proceed to the next step,
+// and we need to restart the sampling flow from step 1. (There's still quite a few improvements we can make here,
+// but this is the current state of the implementation).
+//
+// 3. Inside `handle_sampling_signal`, if it's getting executed by the Ruby thread that is holding the global VM lock,
+// we can call `rb_postponed_job_register_one` to ask the Ruby VM to call our `sample_from_postponed_job` function
+// "as soon as it can".
+//
+// 4. The Ruby VM calls our `sample_from_postponed_job` from a thread holding the global VM lock. A sample is recorded by
+// calling `cpu_and_wall_time_collector_sample`.
+//
+// ---
+
+// Contains state for a single CpuAndWallTimeWorker instance
+struct cpu_and_wall_time_worker_state {
+  // Important: This is not atomic nor is it guaranteed to replace memory barriers and the like. Aka this works for
+  // telling the sampling trigger loop to stop, but if we ever need to communicate more, we should move to actual
+  // atomic operations. stdatomic.h seems a nice thing to reach out for.
+  volatile bool should_run;
+
+  VALUE cpu_and_wall_time_collector_instance;
+  // When something goes wrong during sampling, we record the Ruby exception here, so that it can be "re-raised" on
+  // the CpuAndWallTimeWorker thread
+  VALUE failure_exception;
+};
+
+static VALUE _native_new(VALUE klass);
+static VALUE _native_initialize(DDTRACE_UNUSED VALUE _self, VALUE self_instance, VALUE cpu_and_wall_time_collector_instance);
+static void cpu_and_wall_time_worker_typed_data_mark(void *state_ptr);
+static VALUE _native_sampling_loop(VALUE self, VALUE instance);
+static VALUE _native_stop(DDTRACE_UNUSED VALUE _self, VALUE self_instance);
+static void install_sigprof_signal_handler(void (*signal_handler_function)(int, siginfo_t *, void *));
+static void remove_sigprof_signal_handler(void);
+static void block_sigprof_signal_handler_from_running_in_current_thread(void);
+static void handle_sampling_signal(DDTRACE_UNUSED int _signal, DDTRACE_UNUSED siginfo_t *_info, DDTRACE_UNUSED void *_ucontext);
+static void *run_sampling_trigger_loop(void *state_ptr);
+static void interrupt_sampling_trigger_loop(void *state_ptr);
+static void sample_from_postponed_job(DDTRACE_UNUSED void *_unused);
+static VALUE handle_sampling_failure(VALUE self_instance, VALUE exception);
+static VALUE _native_current_sigprof_signal_handler(DDTRACE_UNUSED VALUE self);
+static VALUE release_gvl_and_run_sampling_trigger_loop(VALUE instance);
+static VALUE _native_is_running(DDTRACE_UNUSED VALUE self, VALUE instance);
+static void testing_signal_handler(DDTRACE_UNUSED int _signal, DDTRACE_UNUSED siginfo_t *_info, DDTRACE_UNUSED void *_ucontext);
+static VALUE _native_install_testing_signal_handler(DDTRACE_UNUSED VALUE self);
+static VALUE _native_remove_testing_signal_handler(DDTRACE_UNUSED VALUE self);
+
+// Global state -- be very careful when accessing or modifying it
+
+// Note: Global state must only be mutated while holding the global VM lock (we piggy back on it to ensure correctness).
+// The active_sampler_instance needs to be global because we access it from the signal handler.
+static VALUE active_sampler_instance = Qnil;
+// ...We also store active_sampler_owner_thread to be able to tell who the active_sampler_instance belongs to (and also
+// to detect when it is outdated)
+static VALUE active_sampler_owner_thread = Qnil;
+
+void collectors_cpu_and_wall_time_worker_init(VALUE profiling_module) {
+  rb_global_variable(&active_sampler_instance);
+  rb_global_variable(&active_sampler_owner_thread);
+
+  VALUE collectors_module = rb_define_module_under(profiling_module, "Collectors");
+  VALUE collectors_cpu_and_wall_time_worker_class = rb_define_class_under(collectors_module, "CpuAndWallTimeWorker", rb_cObject);
+  // Hosts methods used for testing the native code using RSpec
+  VALUE testing_module = rb_define_module_under(collectors_cpu_and_wall_time_worker_class, "Testing");
+
+  // Instances of the CpuAndWallTimeWorker class are "TypedData" objects.
+  // "TypedData" objects are special objects in the Ruby VM that can wrap C structs.
+  // In this case, it wraps the cpu_and_wall_time_worker_state.
+  //
+  // Because Ruby doesn't know how to initialize native-level structs, we MUST override the allocation function for objects
+  // of this class so that we can manage this part. Not overriding or disabling the allocation function is a common
+  // gotcha for "TypedData" objects that can very easily lead to VM crashes, see for instance
+  // https://bugs.ruby-lang.org/issues/18007 for a discussion around this.
+  rb_define_alloc_func(collectors_cpu_and_wall_time_worker_class, _native_new);
+
+  rb_define_singleton_method(collectors_cpu_and_wall_time_worker_class, "_native_initialize", _native_initialize, 2);
+  rb_define_singleton_method(collectors_cpu_and_wall_time_worker_class, "_native_sampling_loop", _native_sampling_loop, 1);
+  rb_define_singleton_method(collectors_cpu_and_wall_time_worker_class, "_native_stop", _native_stop, 1);
+  rb_define_singleton_method(testing_module, "_native_current_sigprof_signal_handler", _native_current_sigprof_signal_handler, 0);
+  rb_define_singleton_method(testing_module, "_native_is_running?", _native_is_running, 1);
+  rb_define_singleton_method(testing_module, "_native_install_testing_signal_handler", _native_install_testing_signal_handler, 0);
+  rb_define_singleton_method(testing_module, "_native_remove_testing_signal_handler", _native_remove_testing_signal_handler, 0);
+}
+
+// This structure is used to define a Ruby object that stores a pointer to a struct cpu_and_wall_time_worker_state
+// See also https://github.com/ruby/ruby/blob/master/doc/extension.rdoc for how this works
+static const rb_data_type_t cpu_and_wall_time_worker_typed_data = {
+  .wrap_struct_name = "Datadog::Profiling::Collectors::CpuAndWallTimeWorker",
+  .function = {
+    .dmark = cpu_and_wall_time_worker_typed_data_mark,
+    .dfree = RUBY_DEFAULT_FREE,
+    .dsize = NULL, // We don't track profile memory usage (although it'd be cool if we did!)
+    //.dcompact = NULL, // FIXME: Add support for compaction
+  },
+  .flags = RUBY_TYPED_FREE_IMMEDIATELY
+};
+
+static VALUE _native_new(VALUE klass) {
+  struct cpu_and_wall_time_worker_state *state = ruby_xcalloc(1, sizeof(struct cpu_and_wall_time_worker_state));
+
+  state->should_run = false;
+  state->cpu_and_wall_time_collector_instance = Qnil;
+  state->failure_exception = Qnil;
+
+  return TypedData_Wrap_Struct(klass, &cpu_and_wall_time_worker_typed_data, state);
+}
+
+static VALUE _native_initialize(DDTRACE_UNUSED VALUE _self, VALUE self_instance, VALUE cpu_and_wall_time_collector_instance) {
+  struct cpu_and_wall_time_worker_state *state;
+  TypedData_Get_Struct(self_instance, struct cpu_and_wall_time_worker_state, &cpu_and_wall_time_worker_typed_data, state);
+
+  state->cpu_and_wall_time_collector_instance = enforce_cpu_and_wall_time_collector_instance(cpu_and_wall_time_collector_instance);
+
+  return Qtrue;
+}
+
+// Since our state contains references to Ruby objects, we need to tell the Ruby GC about them
+static void cpu_and_wall_time_worker_typed_data_mark(void *state_ptr) {
+  struct cpu_and_wall_time_worker_state *state = (struct cpu_and_wall_time_worker_state *) state_ptr;
+
+  rb_gc_mark(state->cpu_and_wall_time_collector_instance);
+  rb_gc_mark(state->failure_exception);
+}
+
+// Called in a background thread created in CpuAndWallTimeWorker#start
+static VALUE _native_sampling_loop(DDTRACE_UNUSED VALUE _self, VALUE instance) {
+  struct cpu_and_wall_time_worker_state *state;
+  TypedData_Get_Struct(instance, struct cpu_and_wall_time_worker_state, &cpu_and_wall_time_worker_typed_data, state);
+
+  if (active_sampler_owner_thread != Qnil && is_thread_alive(active_sampler_owner_thread)) {
+    rb_raise(
+      rb_eRuntimeError,
+      "Could not start CpuAndWallTimeWorker: There's already another instance of CpuAndWallTimeWorker active in a different thread"
+    );
+  }
+
+  // This write to a global is thread-safe BECAUSE we're still holding on to the global VM lock at this point
+  active_sampler_instance = instance;
+  active_sampler_owner_thread = rb_thread_current();
+
+  state->should_run = true;
+
+  block_sigprof_signal_handler_from_running_in_current_thread(); // We want to interrupt the thread with the global VM lock, never this one
+
+  install_sigprof_signal_handler(handle_sampling_signal);
+
+  // Release GVL, get to the actual work!
+  int exception_state;
+  rb_protect(release_gvl_and_run_sampling_trigger_loop, instance, &exception_state);
+
+  // The sample trigger loop finished (either cleanly or with an error); let's clean up
+
+  remove_sigprof_signal_handler();
+  active_sampler_instance = Qnil;
+  active_sampler_owner_thread = Qnil;
+
+  // Ensure that instance is not garbage collected while the native sampling loop is running; this is probably not needed, but just in case
+  RB_GC_GUARD(instance);
+
+  if (exception_state) rb_jump_tag(exception_state); // Re-raise any exception that happened
+
+  return Qnil;
+}
+
+static VALUE _native_stop(DDTRACE_UNUSED VALUE _self, VALUE self_instance) {
+  struct cpu_and_wall_time_worker_state *state;
+  TypedData_Get_Struct(self_instance, struct cpu_and_wall_time_worker_state, &cpu_and_wall_time_worker_typed_data, state);
+
+  state->should_run = false;
+
+  return Qtrue;
+}
+
+static void install_sigprof_signal_handler(void (*signal_handler_function)(int, siginfo_t *, void *)) {
+  struct sigaction existing_signal_handler_config = {.sa_sigaction = NULL};
+  struct sigaction signal_handler_config = {
+    .sa_flags = SA_RESTART | SA_SIGINFO,
+    .sa_sigaction = signal_handler_function
+  };
+  sigemptyset(&signal_handler_config.sa_mask);
+
+  if (sigaction(SIGPROF, &signal_handler_config, &existing_signal_handler_config) != 0) {
+    rb_sys_fail("Could not start CpuAndWallTimeWorker: Could not install signal handler");
+  }
+
+  // In some corner cases (e.g. after a fork), our signal handler may still be around, and that's ok
+  if (existing_signal_handler_config.sa_sigaction == handle_sampling_signal) return;
+
+  if (existing_signal_handler_config.sa_handler != NULL || existing_signal_handler_config.sa_sigaction != NULL) {
+    // A previous signal handler already existed. Currently we don't support this situation, so let's just back out
+    // of the installation.
+
+    if (sigaction(SIGPROF, &existing_signal_handler_config, NULL) != 0) {
+      rb_sys_fail(
+        "Could not start CpuAndWallTimeWorker: Could not re-install pre-existing SIGPROF signal handler. " \
+        "This may break the component had installed it."
+      );
+    }
+
+    rb_raise(rb_eRuntimeError, "Could not start CpuAndWallTimeWorker: There's a pre-existing SIGPROF signal handler");
+  }
+}
+
+static void remove_sigprof_signal_handler(void) {
+  struct sigaction signal_handler_config = {
+    .sa_handler = SIG_DFL, // Reset back to default
+    .sa_flags = SA_RESTART // TODO: Unclear if this is actually needed/does anything at all
+  };
+  sigemptyset(&signal_handler_config.sa_mask);
+
+  if (sigaction(SIGPROF, &signal_handler_config, NULL) != 0) rb_sys_fail("Failure while removing the signal handler");
+}
+
+static void block_sigprof_signal_handler_from_running_in_current_thread(void) {
+  sigset_t signals_to_block;
+  sigemptyset(&signals_to_block);
+  sigaddset(&signals_to_block, SIGPROF);
+  pthread_sigmask(SIG_BLOCK, &signals_to_block, NULL);
+}
+
+static void handle_sampling_signal(DDTRACE_UNUSED int _signal, DDTRACE_UNUSED siginfo_t *_info, DDTRACE_UNUSED void *_ucontext) {
+  if (!ruby_thread_has_gvl_p()) {
+    return; // Not safe to enqueue a sample from this thread
+  }
+
+  // We implicitly assume there can be no concurrent nor nested calls to handle_sampling_signal because
+  // a) we get triggered using SIGPROF, and the docs state second SIGPROF will not interrupt an existing one
+  // b) we validate we are in the thread that has the global VM lock; if a different thread gets a signal, it will return early
+  // because it will not have the global VM lock
+  // TODO: Validate that this does not impact Ractors
+
+  // Note: rb_postponed_job_register_one ensures that if there's a previous sample_from_postponed_job queued for execution
+  // then we will not queue a second one. It does this by doing a linear scan on the existing jobs; in the future we
+  // may want to implement that check ourselves.
+
+  // TODO: Do something with result (potentially update tracking counters?)
+  /*int result =*/ rb_postponed_job_register_one(0, sample_from_postponed_job, NULL);
+}
+
+// The actual sampling trigger loop always runs **without** the global vm lock.
+static void *run_sampling_trigger_loop(void *state_ptr) {
+  struct cpu_and_wall_time_worker_state *state = (struct cpu_and_wall_time_worker_state *) state_ptr;
+
+  struct timespec time_between_signals = {.tv_nsec = 10 * 1000 * 1000 /* 10ms */};
+
+  while (state->should_run) {
+    // TODO: This is still a placeholder for a more complex mechanism. In particular:
+    // * We want to signal a particular thread or threads, not the process in general
+    // * We want to track if a signal landed on the thread holding the global VM lock and do something about it
+    // * We want to do more than having a fixed sampling rate
+
+    kill(getpid(), SIGPROF);
+    nanosleep(&time_between_signals, NULL);
+  }
+
+  return NULL; // Unused
+}
+
+// This is called by the Ruby VM when it wants to shut down the background thread
+static void interrupt_sampling_trigger_loop(void *state_ptr) {
+  struct cpu_and_wall_time_worker_state *state = (struct cpu_and_wall_time_worker_state *) state_ptr;
+
+  state->should_run = false;
+}
+
+static void sample_from_postponed_job(DDTRACE_UNUSED void *_unused) {
+  VALUE instance = active_sampler_instance; // Read from global variable
+
+  // This can potentially happen if the CpuAndWallTimeWorker was stopped while the postponed job was waiting to be executed; nothing to do
+  if (instance == Qnil) return;
+
+  struct cpu_and_wall_time_worker_state *state;
+  TypedData_Get_Struct(instance, struct cpu_and_wall_time_worker_state, &cpu_and_wall_time_worker_typed_data, state);
+
+  // Trigger sampling using the Collectors::CpuAndWallTime; rescue against any exceptions that happen during sampling
+  VALUE (*function_to_call_safely)(VALUE) = cpu_and_wall_time_collector_sample;
+  VALUE function_to_call_safely_arg = state->cpu_and_wall_time_collector_instance;
+  VALUE (*exception_handler_function)(VALUE, VALUE) = handle_sampling_failure;
+  VALUE exception_handler_function_arg = instance;
+  rb_rescue2(
+    function_to_call_safely,
+    function_to_call_safely_arg,
+    exception_handler_function,
+    exception_handler_function_arg,
+    rb_eException, // rb_eException is the base class of all Ruby exceptions
+    0 // Required by API to be the last argument
+  );
+}
+
+static VALUE handle_sampling_failure(VALUE self_instance, VALUE exception) {
+  struct cpu_and_wall_time_worker_state *state;
+  TypedData_Get_Struct(self_instance, struct cpu_and_wall_time_worker_state, &cpu_and_wall_time_worker_typed_data, state);
+
+  state->should_run = false;
+  state->failure_exception = exception;
+
+  return Qnil;
+}
+
+static VALUE _native_current_sigprof_signal_handler(DDTRACE_UNUSED VALUE self) {
+  struct sigaction existing_signal_handler_config = {.sa_sigaction = NULL};
+  if (sigaction(SIGPROF, NULL, &existing_signal_handler_config) != 0) {
+    rb_sys_fail("Failed to probe existing handler");
+  }
+
+  if (existing_signal_handler_config.sa_sigaction == handle_sampling_signal) {
+    return ID2SYM(rb_intern("profiling"));
+  } else if (existing_signal_handler_config.sa_sigaction != NULL) {
+    return ID2SYM(rb_intern("other"));
+  } else {
+    return Qnil;
+  }
+}
+
+static VALUE release_gvl_and_run_sampling_trigger_loop(VALUE instance) {
+  struct cpu_and_wall_time_worker_state *state;
+  TypedData_Get_Struct(instance, struct cpu_and_wall_time_worker_state, &cpu_and_wall_time_worker_typed_data, state);
+
+  rb_thread_call_without_gvl(run_sampling_trigger_loop, state, interrupt_sampling_trigger_loop, state);
+
+  // If we stopped sampling due to an exception, re-raise it (now in the worker thread)
+  if (state->failure_exception != Qnil) rb_exc_raise(state->failure_exception);
+
+  return Qnil;
+}
+
+static VALUE _native_is_running(DDTRACE_UNUSED VALUE self, VALUE instance) {
+  return \
+    (active_sampler_owner_thread != Qnil && is_thread_alive(active_sampler_owner_thread) && active_sampler_instance == instance) ?
+      Qtrue : Qfalse;
+}
+
+static void testing_signal_handler(DDTRACE_UNUSED int _signal, DDTRACE_UNUSED siginfo_t *_info, DDTRACE_UNUSED void *_ucontext) {
+  /* Does nothing on purpose */
+}
+
+static VALUE _native_install_testing_signal_handler(DDTRACE_UNUSED VALUE self) {
+  install_sigprof_signal_handler(testing_signal_handler);
+  return Qtrue;
+}
+
+static VALUE _native_remove_testing_signal_handler(DDTRACE_UNUSED VALUE self) {
+  remove_sigprof_signal_handler();
+  return Qtrue;
+}
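
The design notes at the top of this new file describe the trigger side of the sampler: a background loop sends `SIGPROF` to the process on a fixed interval, and a `sigaction`-installed handler reacts to each delivery. A stripped-down, self-contained sketch of just that trigger mechanism (not ddtrace code; the real handler additionally checks for the GVL and enqueues a postponed job via `rb_postponed_job_register_one` instead of counting):

```c
// Hypothetical, stripped-down version of the SIGPROF trigger mechanism. The handler
// only increments an async-signal-safe counter; everything else is left out.
#include <signal.h>
#include <stdio.h>
#include <sys/types.h>
#include <time.h>
#include <unistd.h>

static volatile sig_atomic_t samples_triggered = 0;

static void handle_sampling_signal(int signo, siginfo_t *info, void *ucontext) {
  (void) signo; (void) info; (void) ucontext;
  samples_triggered++; // Only async-signal-safe work is allowed in a signal handler
}

int main(void) {
  // Install the handler, mirroring install_sigprof_signal_handler above
  struct sigaction signal_handler_config = {.sa_flags = SA_RESTART | SA_SIGINFO};
  signal_handler_config.sa_sigaction = handle_sampling_signal;
  sigemptyset(&signal_handler_config.sa_mask);
  if (sigaction(SIGPROF, &signal_handler_config, NULL) != 0) {
    perror("sigaction");
    return 1;
  }

  // Placeholder trigger loop, mirroring run_sampling_trigger_loop: fixed 10ms interval
  struct timespec time_between_signals = {.tv_nsec = 10 * 1000 * 1000 /* 10ms */};
  for (int i = 0; i < 100; i++) {
    kill(getpid(), SIGPROF);
    nanosleep(&time_between_signals, NULL);
  }

  printf("handler ran %d times\n", (int) samples_triggered);
  return 0;
}
```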
data/ext/ddtrace_profiling_native_extension/extconf.rb
CHANGED
@@ -144,6 +144,8 @@ if RUBY_VERSION < '2.3'
   $defs << '-DUSE_LEGACY_RB_PROFILE_FRAMES'
   # ... you couldn't name threads
   $defs << '-DNO_THREAD_NAMES'
+  # ...the ruby_thread_has_gvl_p function was not exposed to users outside of the VM
+  $defs << '-DNO_THREAD_HAS_GVL'
 end
 
 # If we got here, libdatadog is available and loaded
data/ext/ddtrace_profiling_native_extension/private_vm_api_access.c
CHANGED
@@ -681,3 +681,12 @@ int ddtrace_rb_profile_frames(VALUE thread, int start, int limit, VALUE *buff, i
 }
 
 #endif // USE_LEGACY_RB_PROFILE_FRAMES
+
+#ifdef NO_THREAD_HAS_GVL
+int ruby_thread_has_gvl_p(void) {
+  // TODO: The CpuAndWallTimeWorker needs this function, but Ruby 2.2 doesn't expose it... For now this placeholder
+  // will enable the profiling native extension to continue to compile on Ruby 2.2, but the CpuAndWallTimeWorker will
+  // not work properly on 2.2. Will be addressed later.
+  return 0;
+}
+#endif // NO_THREAD_HAS_GVL
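
Taken together, the extconf.rb and private_vm_api_access.c hunks above are a common compatibility pattern: the build script defines a preprocessor flag when the running Ruby is too old, and the C side compiles a stub under that flag so the rest of the extension keeps building. A tiny sketch of the same idea with made-up names (`NO_FEATURE_X` is not a real ddtrace flag):

```c
// Hypothetical version-gated fallback: a build script would pass -DNO_FEATURE_X when
// the target environment lacks the feature, and this stub keeps the code compiling.
#include <stdio.h>

#ifdef NO_FEATURE_X
// Stub for old versions: reports the feature as unavailable.
static int feature_x_available(void) { return 0; }
#else
static int feature_x_available(void) { return 1; }
#endif

int main(void) {
  printf("feature x available: %d\n", feature_x_available());
  return 0;
}
```

Building with `cc demo.c` versus `cc -DNO_FEATURE_X demo.c` exercises both sides of the guard.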
data/ext/ddtrace_profiling_native_extension/profiling.c
CHANGED
@@ -5,6 +5,7 @@
 
 // Each class/module here is implemented in their separate file
 void collectors_cpu_and_wall_time_init(VALUE profiling_module);
+void collectors_cpu_and_wall_time_worker_init(VALUE profiling_module);
 void collectors_stack_init(VALUE profiling_module);
 void http_transport_init(VALUE profiling_module);
 void stack_recorder_init(VALUE profiling_module);
@@ -22,6 +23,7 @@ void DDTRACE_EXPORT Init_ddtrace_profiling_native_extension(void) {
   rb_define_singleton_method(native_extension_module, "clock_id_for", clock_id_for, 1); // from clock_id.h
 
   collectors_cpu_and_wall_time_init(profiling_module);
+  collectors_cpu_and_wall_time_worker_init(profiling_module);
   collectors_stack_init(profiling_module);
   http_transport_init(profiling_module);
   stack_recorder_init(profiling_module);
data/ext/ddtrace_profiling_native_extension/stack_recorder.c
CHANGED
@@ -324,8 +324,9 @@ static void *call_serialize_without_gvl(void *call_args) {
   return NULL; // Unused
 }
 
-
+VALUE enforce_recorder_instance(VALUE object) {
   Check_TypedStruct(object, &stack_recorder_typed_data);
+  return object;
 }
 
 static struct active_slot_pair sampler_lock_active_profile(struct stack_recorder_state *state) {
data/ext/ddtrace_profiling_native_extension/stack_recorder.h
CHANGED
@@ -15,12 +15,13 @@
 // ```
 #define VALUE_STRING(string) {.ptr = "" string, .len = sizeof(string) - 1}
 
-#define
-#define
-#define
-#define
-#define
-#define
+#define CPU_TIME_VALUE {.type_ = VALUE_STRING("cpu-time"), .unit = VALUE_STRING("nanoseconds")}
+#define CPU_SAMPLES_VALUE {.type_ = VALUE_STRING("cpu-samples"), .unit = VALUE_STRING("count")}
+#define WALL_TIME_VALUE {.type_ = VALUE_STRING("wall-time"), .unit = VALUE_STRING("nanoseconds")}
+#define ALLOC_SIZE_VALUE {.type_ = VALUE_STRING("alloc-size"), .unit = VALUE_STRING("bytes")}
+#define ALLOC_SAMPLES_VALUE {.type_ = VALUE_STRING("alloc-samples"), .unit = VALUE_STRING("count")}
+#define HEAP_LIVE_SIZE_VALUE {.type_ = VALUE_STRING("heap-live-size"), .unit = VALUE_STRING("bytes")}
+#define HEAP_LIVE_SAMPLES_VALUE {.type_ = VALUE_STRING("heap-live-samples"), .unit = VALUE_STRING("count")}
 
 static const ddprof_ffi_ValueType enabled_value_types[] = {
 #define CPU_TIME_VALUE_POS 0
@@ -34,4 +35,4 @@ static const ddprof_ffi_ValueType enabled_value_types[] = {
 #define ENABLED_VALUE_TYPES_COUNT (sizeof(enabled_value_types) / sizeof(ddprof_ffi_ValueType))
 
 void record_sample(VALUE recorder_instance, ddprof_ffi_Sample sample);
-
+VALUE enforce_recorder_instance(VALUE object);