ddtrace 1.3.0 → 1.4.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +40 -2
- data/README.md +1 -1
- data/ext/ddtrace_profiling_loader/ddtrace_profiling_loader.c +10 -1
- data/ext/ddtrace_profiling_native_extension/collectors_cpu_and_wall_time.c +5 -4
- data/ext/ddtrace_profiling_native_extension/collectors_cpu_and_wall_time.h +1 -1
- data/ext/ddtrace_profiling_native_extension/collectors_cpu_and_wall_time_worker.c +391 -0
- data/ext/ddtrace_profiling_native_extension/extconf.rb +2 -0
- data/ext/ddtrace_profiling_native_extension/private_vm_api_access.c +9 -0
- data/ext/ddtrace_profiling_native_extension/profiling.c +2 -0
- data/ext/ddtrace_profiling_native_extension/stack_recorder.c +2 -1
- data/ext/ddtrace_profiling_native_extension/stack_recorder.h +8 -7
- data/lib/datadog/ci/contrib/cucumber/integration.rb +1 -1
- data/lib/datadog/ci/contrib/rspec/integration.rb +1 -1
- data/lib/datadog/core/configuration/base.rb +9 -0
- data/lib/datadog/core/configuration/components.rb +26 -6
- data/lib/datadog/core/configuration/settings.rb +25 -0
- data/lib/datadog/core/configuration.rb +4 -1
- data/lib/datadog/core/telemetry/client.rb +79 -0
- data/lib/datadog/core/telemetry/collector.rb +234 -0
- data/lib/datadog/core/telemetry/emitter.rb +48 -0
- data/lib/datadog/core/telemetry/event.rb +71 -0
- data/lib/datadog/core/telemetry/ext.rb +11 -0
- data/lib/datadog/core/telemetry/heartbeat.rb +37 -0
- data/lib/datadog/core/telemetry/http/adapters/net.rb +113 -0
- data/lib/datadog/core/telemetry/http/env.rb +20 -0
- data/lib/datadog/core/telemetry/http/ext.rb +20 -0
- data/lib/datadog/core/telemetry/http/response.rb +68 -0
- data/lib/datadog/core/telemetry/http/transport.rb +53 -0
- data/lib/datadog/core/telemetry/v1/app_event.rb +52 -0
- data/lib/datadog/core/telemetry/v1/application.rb +86 -0
- data/lib/datadog/core/telemetry/v1/configuration.rb +25 -0
- data/lib/datadog/core/telemetry/v1/dependency.rb +36 -0
- data/lib/datadog/core/telemetry/v1/host.rb +51 -0
- data/lib/datadog/core/telemetry/v1/integration.rb +58 -0
- data/lib/datadog/core/telemetry/v1/product.rb +28 -0
- data/lib/datadog/core/telemetry/v1/telemetry_request.rb +100 -0
- data/lib/datadog/core/utils/sequence.rb +5 -0
- data/lib/datadog/profiling/collectors/cpu_and_wall_time_worker.rb +74 -0
- data/lib/datadog/profiling/stack_recorder.rb +1 -1
- data/lib/datadog/profiling.rb +1 -0
- data/lib/datadog/tracing/contrib/extensions.rb +2 -0
- data/lib/datadog/tracing/contrib/grpc/datadog_interceptor/client.rb +9 -0
- data/lib/datadog/tracing/contrib/grpc/ext.rb +1 -0
- data/lib/datadog/tracing/contrib/patcher.rb +11 -0
- data/lib/datadog/tracing/contrib/rack/patcher.rb +8 -0
- data/lib/datadog/tracing/trace_operation.rb +1 -1
- data/lib/ddtrace/auto_instrument.rb +7 -0
- data/lib/ddtrace/transport/ext.rb +0 -1
- data/lib/ddtrace/transport/http/adapters/net.rb +1 -0
- data/lib/ddtrace/version.rb +2 -2
- metadata +26 -5
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: '01987691359b14541248048bb2f720031b4f0f0ca0be8fade0f6d0daf7aef8cc'
|
4
|
+
data.tar.gz: b2cce425177f3d619d0064d88ffb52073515b8d390e054117a160eac21e939d5
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 7232b588ee13d8a47a1fccbe39a8753a19a4e9c3b0b2dee2cb278a64b183500ec9859cfecffda8c128971bf88c93b148e6f915c2d7906139d1df050b173df42c
|
7
|
+
data.tar.gz: 2605640bffc586a0fd8bb38139d49fa6dad9f04bfe91a0e672cdd04a6c8a8a36896be5322a4b78ecab37175cd600bdc144109646712265b49bad4e103cc1ab5e
|
data/CHANGELOG.md
CHANGED
@@ -2,6 +2,32 @@
|
|
2
2
|
|
3
3
|
## [Unreleased]
|
4
4
|
|
5
|
+
## [1.4.1] - 2022-09-15
|
6
|
+
|
7
|
+
### Fixed
|
8
|
+
|
9
|
+
* Missing distributed traces when trace is dropped by priority sampling ([#2101][], [#2279][])
|
10
|
+
* Profiling support when Ruby is compiled without a shared library ([#2250][])
|
11
|
+
|
12
|
+
## [1.4.0] - 2022-08-25
|
13
|
+
|
14
|
+
Release notes: https://github.com/DataDog/dd-trace-rb/releases/tag/v1.4.0
|
15
|
+
|
16
|
+
Git diff: https://github.com/DataDog/dd-trace-rb/compare/v1.3.0...v1.4.0
|
17
|
+
|
18
|
+
### Added
|
19
|
+
|
20
|
+
* gRPC: tag `grpc.client.deadline` ([#2200][])
|
21
|
+
* Implement telemetry, disabled by default ([#2153][])
|
22
|
+
|
23
|
+
### Changed
|
24
|
+
|
25
|
+
* Bump `libdatadog` dependency version ([#2229][])
|
26
|
+
|
27
|
+
### Fixed
|
28
|
+
|
29
|
+
* Fix CI instrumentation configuration ([#2219][])
|
30
|
+
|
5
31
|
## [1.3.0] - 2022-08-04
|
6
32
|
|
7
33
|
Release notes: https://github.com/DataDog/dd-trace-rb/releases/tag/v1.3.0
|
@@ -2071,7 +2097,11 @@ Release notes: https://github.com/DataDog/dd-trace-rb/releases/tag/v0.3.1
|
|
2071
2097
|
|
2072
2098
|
Git diff: https://github.com/DataDog/dd-trace-rb/compare/v0.3.0...v0.3.1
|
2073
2099
|
|
2074
|
-
[Unreleased]: https://github.com/DataDog/dd-trace-rb/compare/v1.1
|
2100
|
+
[Unreleased]: https://github.com/DataDog/dd-trace-rb/compare/v1.4.1...master
|
2101
|
+
[1.4.1]: https://github.com/DataDog/dd-trace-rb/compare/v1.4.0...v1.4.1
|
2102
|
+
[1.4.0]: https://github.com/DataDog/dd-trace-rb/compare/v1.3.0...v1.4.0
|
2103
|
+
[1.3.0]: https://github.com/DataDog/dd-trace-rb/compare/v1.2.0...v1.3.0
|
2104
|
+
[1.2.0]: https://github.com/DataDog/dd-trace-rb/compare/v1.1.0...v1.2.0
|
2075
2105
|
[1.1.0]: https://github.com/DataDog/dd-trace-rb/compare/v1.0.0...v1.1.0
|
2076
2106
|
[1.0.0]: https://github.com/DataDog/dd-trace-rb/compare/v1.0.0.beta2...v1.0.0
|
2077
2107
|
[1.0.0.beta2]: https://github.com/DataDog/dd-trace-rb/compare/v1.0.0.beta1...v1.0.0.beta2
|
@@ -2931,6 +2961,7 @@ Git diff: https://github.com/DataDog/dd-trace-rb/compare/v0.3.0...v0.3.1
|
|
2931
2961
|
[#2082]: https://github.com/DataDog/dd-trace-rb/issues/2082
|
2932
2962
|
[#2096]: https://github.com/DataDog/dd-trace-rb/issues/2096
|
2933
2963
|
[#2097]: https://github.com/DataDog/dd-trace-rb/issues/2097
|
2964
|
+
[#2101]: https://github.com/DataDog/dd-trace-rb/issues/2101
|
2934
2965
|
[#2110]: https://github.com/DataDog/dd-trace-rb/issues/2110
|
2935
2966
|
[#2113]: https://github.com/DataDog/dd-trace-rb/issues/2113
|
2936
2967
|
[#2118]: https://github.com/DataDog/dd-trace-rb/issues/2118
|
@@ -2939,13 +2970,20 @@ Git diff: https://github.com/DataDog/dd-trace-rb/compare/v0.3.0...v0.3.1
|
|
2939
2970
|
[#2138]: https://github.com/DataDog/dd-trace-rb/issues/2138
|
2940
2971
|
[#2140]: https://github.com/DataDog/dd-trace-rb/issues/2140
|
2941
2972
|
[#2150]: https://github.com/DataDog/dd-trace-rb/issues/2150
|
2973
|
+
[#2153]: https://github.com/DataDog/dd-trace-rb/issues/2153
|
2942
2974
|
[#2158]: https://github.com/DataDog/dd-trace-rb/issues/2158
|
2943
2975
|
[#2162]: https://github.com/DataDog/dd-trace-rb/issues/2162
|
2944
2976
|
[#2163]: https://github.com/DataDog/dd-trace-rb/issues/2163
|
2977
|
+
[#2170]: https://github.com/DataDog/dd-trace-rb/issues/2170
|
2945
2978
|
[#2173]: https://github.com/DataDog/dd-trace-rb/issues/2173
|
2946
2979
|
[#2174]: https://github.com/DataDog/dd-trace-rb/issues/2174
|
2947
2980
|
[#2180]: https://github.com/DataDog/dd-trace-rb/issues/2180
|
2981
|
+
[#2200]: https://github.com/DataDog/dd-trace-rb/issues/2200
|
2948
2982
|
[#2201]: https://github.com/DataDog/dd-trace-rb/issues/2201
|
2983
|
+
[#2219]: https://github.com/DataDog/dd-trace-rb/issues/2219
|
2984
|
+
[#2229]: https://github.com/DataDog/dd-trace-rb/issues/2229
|
2985
|
+
[#2250]: https://github.com/DataDog/dd-trace-rb/issues/2250
|
2986
|
+
[#2279]: https://github.com/DataDog/dd-trace-rb/issues/2279
|
2949
2987
|
[@AdrianLC]: https://github.com/AdrianLC
|
2950
2988
|
[@Azure7111]: https://github.com/Azure7111
|
2951
2989
|
[@BabyGroot]: https://github.com/BabyGroot
|
@@ -3087,4 +3125,4 @@ Git diff: https://github.com/DataDog/dd-trace-rb/compare/v0.3.0...v0.3.1
|
|
3087
3125
|
[@walterking]: https://github.com/walterking
|
3088
3126
|
[@y-yagi]: https://github.com/y-yagi
|
3089
3127
|
[@yukimurasawa]: https://github.com/yukimurasawa
|
3090
|
-
[@zachmccormick]: https://github.com/zachmccormick
|
3128
|
+
[@zachmccormick]: https://github.com/zachmccormick
|
data/README.md
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
|
1
|
+
**We've recently released the 1.x version series. If you're upgrading from a 0.x version, check out our [upgrade guide](https://github.com/DataDog/dd-trace-rb/blob/master/docs/UpgradeGuide.md#from-0x-to-10).**
|
2
2
|
|
3
3
|
# Datadog Trace Client
|
4
4
|
|
@@ -85,7 +85,16 @@ static bool failed_to_load(void *handle, VALUE *failure_details) {
|
|
85
85
|
static bool incompatible_library(void *handle, VALUE *failure_details) {
|
86
86
|
// The library being loaded may be linked to a different libruby than the current executing Ruby.
|
87
87
|
// We check if this is the case by checking if a well-known symbol resolves to a common address.
|
88
|
-
|
88
|
+
|
89
|
+
void *xmalloc_from_library = dlsym(handle, "ruby_xmalloc");
|
90
|
+
|
91
|
+
if (xmalloc_from_library == NULL) {
|
92
|
+
// This happens when ruby is built without a `libruby.so` by using `--disable-shared` at compilation time.
|
93
|
+
// In this situation, no conflict between libruby version is possible.
|
94
|
+
return false;
|
95
|
+
}
|
96
|
+
|
97
|
+
if (xmalloc_from_library != &ruby_xmalloc) {
|
89
98
|
*failure_details = rb_str_new_cstr("library was compiled and linked to a different Ruby version");
|
90
99
|
unload_failed_library(handle);
|
91
100
|
return true;
|
@@ -153,8 +153,6 @@ static VALUE _native_new(VALUE klass) {
|
|
153
153
|
}
|
154
154
|
|
155
155
|
static VALUE _native_initialize(DDTRACE_UNUSED VALUE _self, VALUE collector_instance, VALUE recorder_instance, VALUE max_frames) {
|
156
|
-
enforce_recorder_instance(recorder_instance);
|
157
|
-
|
158
156
|
struct cpu_and_wall_time_collector_state *state;
|
159
157
|
TypedData_Get_Struct(collector_instance, struct cpu_and_wall_time_collector_state, &cpu_and_wall_time_collector_typed_data, state);
|
160
158
|
|
@@ -164,7 +162,7 @@ static VALUE _native_initialize(DDTRACE_UNUSED VALUE _self, VALUE collector_inst
|
|
164
162
|
// Update this when modifying state struct
|
165
163
|
state->sampling_buffer = sampling_buffer_new(max_frames_requested);
|
166
164
|
// hash_map_per_thread_context is already initialized, nothing to do here
|
167
|
-
state->recorder_instance = recorder_instance;
|
165
|
+
state->recorder_instance = enforce_recorder_instance(recorder_instance);
|
168
166
|
|
169
167
|
return Qtrue;
|
170
168
|
}
|
@@ -180,6 +178,8 @@ static VALUE _native_sample(DDTRACE_UNUSED VALUE _self, VALUE collector_instance
|
|
180
178
|
//
|
181
179
|
// Assumption 1: This function is called in a thread that is holding the Global VM Lock. Caller is responsible for enforcing this.
|
182
180
|
// Assumption 2: This function is allowed to raise exceptions. Caller is responsible for handling them, if needed.
|
181
|
+
// Assumption 3: This function IS NOT called from a signal handler. This function is not async-signal-safe.
|
182
|
+
// Assumption 4: This function IS NOT called in a reentrant way.
|
183
183
|
VALUE cpu_and_wall_time_collector_sample(VALUE self_instance) {
|
184
184
|
struct cpu_and_wall_time_collector_state *state;
|
185
185
|
TypedData_Get_Struct(self_instance, struct cpu_and_wall_time_collector_state, &cpu_and_wall_time_collector_typed_data, state);
|
@@ -384,6 +384,7 @@ static long thread_id_for(VALUE thread) {
|
|
384
384
|
return FIXNUM_P(object_id) ? FIX2LONG(object_id) : -1;
|
385
385
|
}
|
386
386
|
|
387
|
-
|
387
|
+
VALUE enforce_cpu_and_wall_time_collector_instance(VALUE object) {
|
388
388
|
Check_TypedStruct(object, &cpu_and_wall_time_collector_typed_data);
|
389
|
+
return object;
|
389
390
|
}
|
@@ -0,0 +1,391 @@
|
|
1
|
+
#include <ruby.h>
|
2
|
+
#include <ruby/thread.h>
|
3
|
+
#include <ruby/thread_native.h>
|
4
|
+
#include <ruby/debug.h>
|
5
|
+
#include <stdbool.h>
|
6
|
+
#include <signal.h>
|
7
|
+
#include "helpers.h"
|
8
|
+
#include "ruby_helpers.h"
|
9
|
+
#include "collectors_cpu_and_wall_time.h"
|
10
|
+
#include "private_vm_api_access.h"
|
11
|
+
|
12
|
+
// Used to trigger the periodic execution of Collectors::CpuAndWallTime, which implements all of the sampling logic
|
13
|
+
// itself; this class only implements the "doing it periodically" part.
|
14
|
+
//
|
15
|
+
// This file implements the native bits of the Datadog::Profiling::Collectors::CpuAndWallTimeWorker class
|
16
|
+
|
17
|
+
// ---
|
18
|
+
// Here be dragons: This component is quite fiddly and probably one of the more complex in the profiler as it deals with
|
19
|
+
// multiple threads, signal handlers, global state, etc.
|
20
|
+
//
|
21
|
+
// ## Design notes for this class:
|
22
|
+
//
|
23
|
+
// ### Constraints
|
24
|
+
//
|
25
|
+
// Currently, sampling Ruby threads requires calling Ruby VM APIs that are only safe to call while holding on to the
|
26
|
+
// global VM lock (and are not async-signal safe -- cannot be called from a signal handler).
|
27
|
+
//
|
28
|
+
// @ivoanjo: As a note, I don't think we should think of this constraint as set in stone. Since we can reach into the Ruby
|
29
|
+
// internals, we may be able to figure out a way of overcoming it. But it's definitely going to be hard so for now
|
30
|
+
// we're considering it as a given.
|
31
|
+
//
|
32
|
+
// ### Flow for triggering samples
|
33
|
+
//
|
34
|
+
// The flow for triggering samples is as follows:
|
35
|
+
//
|
36
|
+
// 1. Inside the `run_sampling_trigger_loop` function (running in the `CpuAndWallTimeWorker` background thread),
|
37
|
+
// a `SIGPROF` signal gets sent to the current process.
|
38
|
+
//
|
39
|
+
// 2. The `handle_sampling_signal` signal handler function gets called to handle the `SIGPROF` signal.
|
40
|
+
//
|
41
|
+
// Which thread the signal handler function gets called on by the operating system is quite important. We need to perform
|
42
|
+
// an operation -- calling the `rb_postponed_job_register_one` API -- that can only be called from the thread that
|
43
|
+
// is holding on to the global VM lock. So this is the thread we're "hoping" our signal lands on.
|
44
|
+
//
|
45
|
+
// The signal never lands on the `CpuAndWallTimeWorker` background thread because we explicitly block it off from that
|
46
|
+
// thread in `block_sigprof_signal_handler_from_running_in_current_thread`.
|
47
|
+
//
|
48
|
+
// If the signal lands on a thread that is not holding onto the global VM lock, we can't proceed to the next step,
|
49
|
+
// and we need to restart the sampling flow from step 1. (There are still quite a few improvements we can make here,
|
50
|
+
// but this is the current state of the implementation).
|
51
|
+
//
|
52
|
+
// 3. Inside `handle_sampling_signal`, if it's getting executed by the Ruby thread that is holding the global VM lock,
|
53
|
+
// we can call `rb_postponed_job_register_one` to ask the Ruby VM to call our `sample_from_postponed_job` function
|
54
|
+
// "as soon as it can".
|
55
|
+
//
|
56
|
+
// 4. The Ruby VM calls our `sample_from_postponed_job` from a thread holding the global VM lock. A sample is recorded by
|
57
|
+
// calling `cpu_and_wall_time_collector_sample`.
|
58
|
+
//
|
59
|
+
// ---
|
60
|
+
|
61
|
+
// Contains state for a single CpuAndWallTimeWorker instance
|
62
|
+
struct cpu_and_wall_time_worker_state {
|
63
|
+
// Important: This is not atomic nor is it guaranteed to replace memory barriers and the like. Aka this works for
|
64
|
+
// telling the sampling trigger loop to stop, but if we ever need to communicate more, we should move to actual
|
65
|
+
// atomic operations. stdatomic.h seems a nice thing to reach out for.
|
66
|
+
volatile bool should_run;
|
67
|
+
|
68
|
+
VALUE cpu_and_wall_time_collector_instance;
|
69
|
+
// When something goes wrong during sampling, we record the Ruby exception here, so that it can be "re-raised" on
|
70
|
+
// the CpuAndWallTimeWorker thread
|
71
|
+
VALUE failure_exception;
|
72
|
+
};
|
73
|
+
|
74
|
+
static VALUE _native_new(VALUE klass);
|
75
|
+
static VALUE _native_initialize(DDTRACE_UNUSED VALUE _self, VALUE self_instance, VALUE cpu_and_wall_time_collector_instance);
|
76
|
+
static void cpu_and_wall_time_worker_typed_data_mark(void *state_ptr);
|
77
|
+
static VALUE _native_sampling_loop(VALUE self, VALUE instance);
|
78
|
+
static VALUE _native_stop(DDTRACE_UNUSED VALUE _self, VALUE self_instance);
|
79
|
+
static void install_sigprof_signal_handler(void (*signal_handler_function)(int, siginfo_t *, void *));
|
80
|
+
static void remove_sigprof_signal_handler(void);
|
81
|
+
static void block_sigprof_signal_handler_from_running_in_current_thread(void);
|
82
|
+
static void handle_sampling_signal(DDTRACE_UNUSED int _signal, DDTRACE_UNUSED siginfo_t *_info, DDTRACE_UNUSED void *_ucontext);
|
83
|
+
static void *run_sampling_trigger_loop(void *state_ptr);
|
84
|
+
static void interrupt_sampling_trigger_loop(void *state_ptr);
|
85
|
+
static void sample_from_postponed_job(DDTRACE_UNUSED void *_unused);
|
86
|
+
static VALUE handle_sampling_failure(VALUE self_instance, VALUE exception);
|
87
|
+
static VALUE _native_current_sigprof_signal_handler(DDTRACE_UNUSED VALUE self);
|
88
|
+
static VALUE release_gvl_and_run_sampling_trigger_loop(VALUE instance);
|
89
|
+
static VALUE _native_is_running(DDTRACE_UNUSED VALUE self, VALUE instance);
|
90
|
+
static void testing_signal_handler(DDTRACE_UNUSED int _signal, DDTRACE_UNUSED siginfo_t *_info, DDTRACE_UNUSED void *_ucontext);
|
91
|
+
static VALUE _native_install_testing_signal_handler(DDTRACE_UNUSED VALUE self);
|
92
|
+
static VALUE _native_remove_testing_signal_handler(DDTRACE_UNUSED VALUE self);
|
93
|
+
|
94
|
+
// Global state -- be very careful when accessing or modifying it
|
95
|
+
|
96
|
+
// Note: Global state must only be mutated while holding the global VM lock (we piggy back on it to ensure correctness).
|
97
|
+
// The active_sampler_instance needs to be global because we access it from the signal handler.
|
98
|
+
static VALUE active_sampler_instance = Qnil;
|
99
|
+
// ...We also store active_sampler_owner_thread to be able to tell who the active_sampler_instance belongs to (and also
|
100
|
+
// to detect when it is outdated)
|
101
|
+
static VALUE active_sampler_owner_thread = Qnil;
|
102
|
+
|
103
|
+
void collectors_cpu_and_wall_time_worker_init(VALUE profiling_module) {
|
104
|
+
rb_global_variable(&active_sampler_instance);
|
105
|
+
rb_global_variable(&active_sampler_owner_thread);
|
106
|
+
|
107
|
+
VALUE collectors_module = rb_define_module_under(profiling_module, "Collectors");
|
108
|
+
VALUE collectors_cpu_and_wall_time_worker_class = rb_define_class_under(collectors_module, "CpuAndWallTimeWorker", rb_cObject);
|
109
|
+
// Hosts methods used for testing the native code using RSpec
|
110
|
+
VALUE testing_module = rb_define_module_under(collectors_cpu_and_wall_time_worker_class, "Testing");
|
111
|
+
|
112
|
+
// Instances of the CpuAndWallTimeWorker class are "TypedData" objects.
|
113
|
+
// "TypedData" objects are special objects in the Ruby VM that can wrap C structs.
|
114
|
+
// In this case, it wraps the cpu_and_wall_time_worker_state.
|
115
|
+
//
|
116
|
+
// Because Ruby doesn't know how to initialize native-level structs, we MUST override the allocation function for objects
|
117
|
+
// of this class so that we can manage this part. Not overriding or disabling the allocation function is a common
|
118
|
+
// gotcha for "TypedData" objects that can very easily lead to VM crashes, see for instance
|
119
|
+
// https://bugs.ruby-lang.org/issues/18007 for a discussion around this.
|
120
|
+
rb_define_alloc_func(collectors_cpu_and_wall_time_worker_class, _native_new);
|
121
|
+
|
122
|
+
rb_define_singleton_method(collectors_cpu_and_wall_time_worker_class, "_native_initialize", _native_initialize, 2);
|
123
|
+
rb_define_singleton_method(collectors_cpu_and_wall_time_worker_class, "_native_sampling_loop", _native_sampling_loop, 1);
|
124
|
+
rb_define_singleton_method(collectors_cpu_and_wall_time_worker_class, "_native_stop", _native_stop, 1);
|
125
|
+
rb_define_singleton_method(testing_module, "_native_current_sigprof_signal_handler", _native_current_sigprof_signal_handler, 0);
|
126
|
+
rb_define_singleton_method(testing_module, "_native_is_running?", _native_is_running, 1);
|
127
|
+
rb_define_singleton_method(testing_module, "_native_install_testing_signal_handler", _native_install_testing_signal_handler, 0);
|
128
|
+
rb_define_singleton_method(testing_module, "_native_remove_testing_signal_handler", _native_remove_testing_signal_handler, 0);
|
129
|
+
}
|
130
|
+
|
131
|
+
// This structure is used to define a Ruby object that stores a pointer to a struct cpu_and_wall_time_worker_state
|
132
|
+
// See also https://github.com/ruby/ruby/blob/master/doc/extension.rdoc for how this works
|
133
|
+
static const rb_data_type_t cpu_and_wall_time_worker_typed_data = {
|
134
|
+
.wrap_struct_name = "Datadog::Profiling::Collectors::CpuAndWallTimeWorker",
|
135
|
+
.function = {
|
136
|
+
.dmark = cpu_and_wall_time_worker_typed_data_mark,
|
137
|
+
.dfree = RUBY_DEFAULT_FREE,
|
138
|
+
.dsize = NULL, // We don't track profile memory usage (although it'd be cool if we did!)
|
139
|
+
//.dcompact = NULL, // FIXME: Add support for compaction
|
140
|
+
},
|
141
|
+
.flags = RUBY_TYPED_FREE_IMMEDIATELY
|
142
|
+
};
|
143
|
+
|
144
|
+
static VALUE _native_new(VALUE klass) {
|
145
|
+
struct cpu_and_wall_time_worker_state *state = ruby_xcalloc(1, sizeof(struct cpu_and_wall_time_worker_state));
|
146
|
+
|
147
|
+
state->should_run = false;
|
148
|
+
state->cpu_and_wall_time_collector_instance = Qnil;
|
149
|
+
state->failure_exception = Qnil;
|
150
|
+
|
151
|
+
return TypedData_Wrap_Struct(klass, &cpu_and_wall_time_worker_typed_data, state);
|
152
|
+
}
|
153
|
+
|
154
|
+
static VALUE _native_initialize(DDTRACE_UNUSED VALUE _self, VALUE self_instance, VALUE cpu_and_wall_time_collector_instance) {
|
155
|
+
struct cpu_and_wall_time_worker_state *state;
|
156
|
+
TypedData_Get_Struct(self_instance, struct cpu_and_wall_time_worker_state, &cpu_and_wall_time_worker_typed_data, state);
|
157
|
+
|
158
|
+
state->cpu_and_wall_time_collector_instance = enforce_cpu_and_wall_time_collector_instance(cpu_and_wall_time_collector_instance);
|
159
|
+
|
160
|
+
return Qtrue;
|
161
|
+
}
|
162
|
+
|
163
|
+
// Since our state contains references to Ruby objects, we need to tell the Ruby GC about them
|
164
|
+
static void cpu_and_wall_time_worker_typed_data_mark(void *state_ptr) {
|
165
|
+
struct cpu_and_wall_time_worker_state *state = (struct cpu_and_wall_time_worker_state *) state_ptr;
|
166
|
+
|
167
|
+
rb_gc_mark(state->cpu_and_wall_time_collector_instance);
|
168
|
+
rb_gc_mark(state->failure_exception);
|
169
|
+
}
|
170
|
+
|
171
|
+
// Called in a background thread created in CpuAndWallTimeWorker#start
|
172
|
+
static VALUE _native_sampling_loop(DDTRACE_UNUSED VALUE _self, VALUE instance) {
|
173
|
+
struct cpu_and_wall_time_worker_state *state;
|
174
|
+
TypedData_Get_Struct(instance, struct cpu_and_wall_time_worker_state, &cpu_and_wall_time_worker_typed_data, state);
|
175
|
+
|
176
|
+
if (active_sampler_owner_thread != Qnil && is_thread_alive(active_sampler_owner_thread)) {
|
177
|
+
rb_raise(
|
178
|
+
rb_eRuntimeError,
|
179
|
+
"Could not start CpuAndWallTimeWorker: There's already another instance of CpuAndWallTimeWorker active in a different thread"
|
180
|
+
);
|
181
|
+
}
|
182
|
+
|
183
|
+
// This write to a global is thread-safe BECAUSE we're still holding on to the global VM lock at this point
|
184
|
+
active_sampler_instance = instance;
|
185
|
+
active_sampler_owner_thread = rb_thread_current();
|
186
|
+
|
187
|
+
state->should_run = true;
|
188
|
+
|
189
|
+
block_sigprof_signal_handler_from_running_in_current_thread(); // We want to interrupt the thread with the global VM lock, never this one
|
190
|
+
|
191
|
+
install_sigprof_signal_handler(handle_sampling_signal);
|
192
|
+
|
193
|
+
// Release GVL, get to the actual work!
|
194
|
+
int exception_state;
|
195
|
+
rb_protect(release_gvl_and_run_sampling_trigger_loop, instance, &exception_state);
|
196
|
+
|
197
|
+
// The sample trigger loop finished (either cleanly or with an error); let's clean up
|
198
|
+
|
199
|
+
remove_sigprof_signal_handler();
|
200
|
+
active_sampler_instance = Qnil;
|
201
|
+
active_sampler_owner_thread = Qnil;
|
202
|
+
|
203
|
+
// Ensure that instance is not garbage collected while the native sampling loop is running; this is probably not needed, but just in case
|
204
|
+
RB_GC_GUARD(instance);
|
205
|
+
|
206
|
+
if (exception_state) rb_jump_tag(exception_state); // Re-raise any exception that happened
|
207
|
+
|
208
|
+
return Qnil;
|
209
|
+
}
|
210
|
+
|
211
|
+
static VALUE _native_stop(DDTRACE_UNUSED VALUE _self, VALUE self_instance) {
|
212
|
+
struct cpu_and_wall_time_worker_state *state;
|
213
|
+
TypedData_Get_Struct(self_instance, struct cpu_and_wall_time_worker_state, &cpu_and_wall_time_worker_typed_data, state);
|
214
|
+
|
215
|
+
state->should_run = false;
|
216
|
+
|
217
|
+
return Qtrue;
|
218
|
+
}
|
219
|
+
|
220
|
+
static void install_sigprof_signal_handler(void (*signal_handler_function)(int, siginfo_t *, void *)) {
|
221
|
+
struct sigaction existing_signal_handler_config = {.sa_sigaction = NULL};
|
222
|
+
struct sigaction signal_handler_config = {
|
223
|
+
.sa_flags = SA_RESTART | SA_SIGINFO,
|
224
|
+
.sa_sigaction = signal_handler_function
|
225
|
+
};
|
226
|
+
sigemptyset(&signal_handler_config.sa_mask);
|
227
|
+
|
228
|
+
if (sigaction(SIGPROF, &signal_handler_config, &existing_signal_handler_config) != 0) {
|
229
|
+
rb_sys_fail("Could not start CpuAndWallTimeWorker: Could not install signal handler");
|
230
|
+
}
|
231
|
+
|
232
|
+
// In some corner cases (e.g. after a fork), our signal handler may still be around, and that's ok
|
233
|
+
if (existing_signal_handler_config.sa_sigaction == handle_sampling_signal) return;
|
234
|
+
|
235
|
+
if (existing_signal_handler_config.sa_handler != NULL || existing_signal_handler_config.sa_sigaction != NULL) {
|
236
|
+
// A previous signal handler already existed. Currently we don't support this situation, so let's just back out
|
237
|
+
// of the installation.
|
238
|
+
|
239
|
+
if (sigaction(SIGPROF, &existing_signal_handler_config, NULL) != 0) {
|
240
|
+
rb_sys_fail(
|
241
|
+
"Could not start CpuAndWallTimeWorker: Could not re-install pre-existing SIGPROF signal handler. " \
|
242
|
+
"This may break the component had installed it."
|
243
|
+
);
|
244
|
+
}
|
245
|
+
|
246
|
+
rb_raise(rb_eRuntimeError, "Could not start CpuAndWallTimeWorker: There's a pre-existing SIGPROF signal handler");
|
247
|
+
}
|
248
|
+
}
|
249
|
+
|
250
|
+
static void remove_sigprof_signal_handler(void) {
|
251
|
+
struct sigaction signal_handler_config = {
|
252
|
+
.sa_handler = SIG_DFL, // Reset back to default
|
253
|
+
.sa_flags = SA_RESTART // TODO: Unclear if this is actually needed/does anything at all
|
254
|
+
};
|
255
|
+
sigemptyset(&signal_handler_config.sa_mask);
|
256
|
+
|
257
|
+
if (sigaction(SIGPROF, &signal_handler_config, NULL) != 0) rb_sys_fail("Failure while removing the signal handler");
|
258
|
+
}
|
259
|
+
|
260
|
+
static void block_sigprof_signal_handler_from_running_in_current_thread(void) {
|
261
|
+
sigset_t signals_to_block;
|
262
|
+
sigemptyset(&signals_to_block);
|
263
|
+
sigaddset(&signals_to_block, SIGPROF);
|
264
|
+
pthread_sigmask(SIG_BLOCK, &signals_to_block, NULL);
|
265
|
+
}
|
266
|
+
|
267
|
+
static void handle_sampling_signal(DDTRACE_UNUSED int _signal, DDTRACE_UNUSED siginfo_t *_info, DDTRACE_UNUSED void *_ucontext) {
|
268
|
+
if (!ruby_thread_has_gvl_p()) {
|
269
|
+
return; // Not safe to enqueue a sample from this thread
|
270
|
+
}
|
271
|
+
|
272
|
+
// We implicitly assume there can be no concurrent nor nested calls to handle_sampling_signal because
|
273
|
+
// a) we get triggered using SIGPROF, and the docs state that a second SIGPROF will not interrupt an existing one
|
274
|
+
// b) we validate we are in the thread that has the global VM lock; if a different thread gets a signal, it will return early
|
275
|
+
// because it will not have the global VM lock
|
276
|
+
// TODO: Validate that this does not impact Ractors
|
277
|
+
|
278
|
+
// Note: rb_postponed_job_register_one ensures that if there's a previous sample_from_postponed_job queued for execution
|
279
|
+
// then we will not queue a second one. It does this by doing a linear scan on the existing jobs; in the future we
|
280
|
+
// may want to implement that check ourselves.
|
281
|
+
|
282
|
+
// TODO: Do something with result (potentially update tracking counters?)
|
283
|
+
/*int result =*/ rb_postponed_job_register_one(0, sample_from_postponed_job, NULL);
|
284
|
+
}
|
285
|
+
|
286
|
+
// The actual sampling trigger loop always runs **without** the global vm lock.
|
287
|
+
static void *run_sampling_trigger_loop(void *state_ptr) {
|
288
|
+
struct cpu_and_wall_time_worker_state *state = (struct cpu_and_wall_time_worker_state *) state_ptr;
|
289
|
+
|
290
|
+
struct timespec time_between_signals = {.tv_nsec = 10 * 1000 * 1000 /* 10ms */};
|
291
|
+
|
292
|
+
while (state->should_run) {
|
293
|
+
// TODO: This is still a placeholder for a more complex mechanism. In particular:
|
294
|
+
// * We want to signal a particular thread or threads, not the process in general
|
295
|
+
// * We want to track if a signal landed on the thread holding the global VM lock and do something about it
|
296
|
+
// * We want to do more than having a fixed sampling rate
|
297
|
+
|
298
|
+
kill(getpid(), SIGPROF);
|
299
|
+
nanosleep(&time_between_signals, NULL);
|
300
|
+
}
|
301
|
+
|
302
|
+
return NULL; // Unused
|
303
|
+
}
|
304
|
+
|
305
|
+
// This is called by the Ruby VM when it wants to shut down the background thread
|
306
|
+
static void interrupt_sampling_trigger_loop(void *state_ptr) {
|
307
|
+
struct cpu_and_wall_time_worker_state *state = (struct cpu_and_wall_time_worker_state *) state_ptr;
|
308
|
+
|
309
|
+
state->should_run = false;
|
310
|
+
}
|
311
|
+
|
312
|
+
static void sample_from_postponed_job(DDTRACE_UNUSED void *_unused) {
|
313
|
+
VALUE instance = active_sampler_instance; // Read from global variable
|
314
|
+
|
315
|
+
// This can potentially happen if the CpuAndWallTimeWorker was stopped while the postponed job was waiting to be executed; nothing to do
|
316
|
+
if (instance == Qnil) return;
|
317
|
+
|
318
|
+
struct cpu_and_wall_time_worker_state *state;
|
319
|
+
TypedData_Get_Struct(instance, struct cpu_and_wall_time_worker_state, &cpu_and_wall_time_worker_typed_data, state);
|
320
|
+
|
321
|
+
// Trigger sampling using the Collectors::CpuAndWallTime; rescue against any exceptions that happen during sampling
|
322
|
+
VALUE (*function_to_call_safely)(VALUE) = cpu_and_wall_time_collector_sample;
|
323
|
+
VALUE function_to_call_safely_arg = state->cpu_and_wall_time_collector_instance;
|
324
|
+
VALUE (*exception_handler_function)(VALUE, VALUE) = handle_sampling_failure;
|
325
|
+
VALUE exception_handler_function_arg = instance;
|
326
|
+
rb_rescue2(
|
327
|
+
function_to_call_safely,
|
328
|
+
function_to_call_safely_arg,
|
329
|
+
exception_handler_function,
|
330
|
+
exception_handler_function_arg,
|
331
|
+
rb_eException, // rb_eException is the base class of all Ruby exceptions
|
332
|
+
0 // Required by API to be the last argument
|
333
|
+
);
|
334
|
+
}
|
335
|
+
|
336
|
+
static VALUE handle_sampling_failure(VALUE self_instance, VALUE exception) {
|
337
|
+
struct cpu_and_wall_time_worker_state *state;
|
338
|
+
TypedData_Get_Struct(self_instance, struct cpu_and_wall_time_worker_state, &cpu_and_wall_time_worker_typed_data, state);
|
339
|
+
|
340
|
+
state->should_run = false;
|
341
|
+
state->failure_exception = exception;
|
342
|
+
|
343
|
+
return Qnil;
|
344
|
+
}
|
345
|
+
|
346
|
+
// Reports which kind of handler is currently installed for SIGPROF.
//
// Returns:
// * `:profiling` when the installed handler is `handle_sampling_signal`
// * `:other` when some other (non-NULL) handler is installed
// * `nil` when no handler is installed
//
// Raises a system call error (via `rb_sys_fail`) if the current configuration cannot be queried.
static VALUE _native_current_sigprof_signal_handler(DDTRACE_UNUSED VALUE self) {
  struct sigaction current_config = {.sa_sigaction = NULL};

  // Passing NULL as the new action only queries the existing configuration, without changing it
  if (sigaction(SIGPROF, NULL, &current_config) != 0) {
    rb_sys_fail("Failed to probe existing handler");
  }

  if (current_config.sa_sigaction == handle_sampling_signal) {
    return ID2SYM(rb_intern("profiling"));
  }

  if (current_config.sa_sigaction != NULL) {
    return ID2SYM(rb_intern("other"));
  }

  return Qnil;
}
|
360
|
+
|
361
|
+
// Runs `run_sampling_trigger_loop` for the given worker `instance` with the Global VM Lock released, using
// `interrupt_sampling_trigger_loop` as the unblock function.
//
// Once the loop returns, any exception recorded in the worker state (see `handle_sampling_failure`) is raised from
// here, i.e. in the thread that was running the loop. Otherwise returns nil.
static VALUE release_gvl_and_run_sampling_trigger_loop(VALUE instance) {
  struct cpu_and_wall_time_worker_state *worker_state;
  TypedData_Get_Struct(
    instance,
    struct cpu_and_wall_time_worker_state,
    &cpu_and_wall_time_worker_typed_data,
    worker_state
  );

  rb_thread_call_without_gvl(
    run_sampling_trigger_loop,
    worker_state,
    interrupt_sampling_trigger_loop,
    worker_state
  );

  // A recorded failure means sampling stopped due to an exception; surface it to the caller
  if (worker_state->failure_exception != Qnil) {
    rb_exc_raise(worker_state->failure_exception);
  }

  return Qnil;
}
|
372
|
+
|
373
|
+
// Returns true only when all of the following hold (checked in this order):
// 1. there is an active sampler owner thread
// 2. that thread is still alive
// 3. `instance` is the currently-active sampler instance
// Otherwise returns false.
static VALUE _native_is_running(DDTRACE_UNUSED VALUE self, VALUE instance) {
  if (active_sampler_owner_thread == Qnil) return Qfalse;
  if (!is_thread_alive(active_sampler_owner_thread)) return Qfalse;
  if (active_sampler_instance != instance) return Qfalse;

  return Qtrue;
}
|
378
|
+
|
379
|
+
// No-op signal handler (three-argument `sa_sigaction` form): every argument is ignored.
// It gets installed for SIGPROF by `_native_install_testing_signal_handler`.
static void testing_signal_handler(
  DDTRACE_UNUSED int _signal,
  DDTRACE_UNUSED siginfo_t *_info,
  DDTRACE_UNUSED void *_ucontext
) {
  // Intentionally empty: receiving the signal should have no effect
}
|
382
|
+
|
383
|
+
// Installs the no-op `testing_signal_handler` as the SIGPROF handler (via `install_sigprof_signal_handler`).
// Always returns true; presumably only intended to be used by tests.
static VALUE _native_install_testing_signal_handler(DDTRACE_UNUSED VALUE self) {
  void (*noop_handler)(int, siginfo_t *, void *) = testing_signal_handler;

  install_sigprof_signal_handler(noop_handler);

  return Qtrue;
}
|
387
|
+
|
388
|
+
// Counterpart of `_native_install_testing_signal_handler`: delegates to `remove_sigprof_signal_handler` to take the
// SIGPROF handler out again. Always returns true.
static VALUE _native_remove_testing_signal_handler(DDTRACE_UNUSED VALUE self) {
  remove_sigprof_signal_handler();

  return Qtrue;
}
|
@@ -144,6 +144,8 @@ if RUBY_VERSION < '2.3'
|
|
144
144
|
$defs << '-DUSE_LEGACY_RB_PROFILE_FRAMES'
|
145
145
|
# ... you couldn't name threads
|
146
146
|
$defs << '-DNO_THREAD_NAMES'
|
147
|
+
# ...the ruby_thread_has_gvl_p function was not exposed to users outside of the VM
|
148
|
+
$defs << '-DNO_THREAD_HAS_GVL'
|
147
149
|
end
|
148
150
|
|
149
151
|
# If we got here, libdatadog is available and loaded
|
@@ -681,3 +681,12 @@ int ddtrace_rb_profile_frames(VALUE thread, int start, int limit, VALUE *buff, i
|
|
681
681
|
}
|
682
682
|
|
683
683
|
#endif // USE_LEGACY_RB_PROFILE_FRAMES
|
684
|
+
|
685
|
+
#ifdef NO_THREAD_HAS_GVL
|
686
|
+
// Placeholder for Ruby 2.2, which doesn't expose `ruby_thread_has_gvl_p`.
//
// TODO: The CpuAndWallTimeWorker needs this function. For now this placeholder enables the profiling native
// extension to continue to compile on Ruby 2.2, but the CpuAndWallTimeWorker will not work properly on 2.2. Will be
// addressed later.
int ruby_thread_has_gvl_p(void) {
  // Always claims that the current thread does NOT hold the GVL
  const int thread_has_gvl = 0;

  return thread_has_gvl;
}
|
692
|
+
#endif // NO_THREAD_HAS_GVL
|
@@ -5,6 +5,7 @@
|
|
5
5
|
|
6
6
|
// Each class/module here is implemented in their separate file
|
7
7
|
void collectors_cpu_and_wall_time_init(VALUE profiling_module);
|
8
|
+
void collectors_cpu_and_wall_time_worker_init(VALUE profiling_module);
|
8
9
|
void collectors_stack_init(VALUE profiling_module);
|
9
10
|
void http_transport_init(VALUE profiling_module);
|
10
11
|
void stack_recorder_init(VALUE profiling_module);
|
@@ -22,6 +23,7 @@ void DDTRACE_EXPORT Init_ddtrace_profiling_native_extension(void) {
|
|
22
23
|
rb_define_singleton_method(native_extension_module, "clock_id_for", clock_id_for, 1); // from clock_id.h
|
23
24
|
|
24
25
|
collectors_cpu_and_wall_time_init(profiling_module);
|
26
|
+
collectors_cpu_and_wall_time_worker_init(profiling_module);
|
25
27
|
collectors_stack_init(profiling_module);
|
26
28
|
http_transport_init(profiling_module);
|
27
29
|
stack_recorder_init(profiling_module);
|
@@ -324,8 +324,9 @@ static void *call_serialize_without_gvl(void *call_args) {
|
|
324
324
|
return NULL; // Unused
|
325
325
|
}
|
326
326
|
|
327
|
-
|
327
|
+
// Validates that `object` is backed by `stack_recorder_typed_data` (the check itself is delegated to
// `Check_TypedStruct`) and hands the very same object back, so the call can be used inline as an expression.
VALUE enforce_recorder_instance(VALUE object) {
  // Does not return normally if the check fails -- presumably raises; see the Ruby C API docs for Check_TypedStruct
  Check_TypedStruct(object, &stack_recorder_typed_data);

  return object;
}
|
330
331
|
|
331
332
|
static struct active_slot_pair sampler_lock_active_profile(struct stack_recorder_state *state) {
|
@@ -15,12 +15,13 @@
|
|
15
15
|
// ```
|
16
16
|
#define VALUE_STRING(string) {.ptr = "" string, .len = sizeof(string) - 1}
|
17
17
|
|
18
|
-
#define
|
19
|
-
#define
|
20
|
-
#define
|
21
|
-
#define
|
22
|
-
#define
|
23
|
-
#define
|
18
|
+
// Initializers for the value types known to the stack recorder: each macro expands to a brace-enclosed initializer
// that sets a `.type_` (name) and a `.unit` field, both built with the VALUE_STRING macro.
#define CPU_TIME_VALUE {.type_ = VALUE_STRING("cpu-time"), .unit = VALUE_STRING("nanoseconds")}
|
19
|
+
#define CPU_SAMPLES_VALUE {.type_ = VALUE_STRING("cpu-samples"), .unit = VALUE_STRING("count")}
|
20
|
+
#define WALL_TIME_VALUE {.type_ = VALUE_STRING("wall-time"), .unit = VALUE_STRING("nanoseconds")}
|
21
|
+
#define ALLOC_SIZE_VALUE {.type_ = VALUE_STRING("alloc-size"), .unit = VALUE_STRING("bytes")}
|
22
|
+
#define ALLOC_SAMPLES_VALUE {.type_ = VALUE_STRING("alloc-samples"), .unit = VALUE_STRING("count")}
|
23
|
+
#define HEAP_LIVE_SIZE_VALUE {.type_ = VALUE_STRING("heap-live-size"), .unit = VALUE_STRING("bytes")}
|
24
|
+
#define HEAP_LIVE_SAMPLES_VALUE {.type_ = VALUE_STRING("heap-live-samples"), .unit = VALUE_STRING("count")}
|
24
25
|
|
25
26
|
static const ddprof_ffi_ValueType enabled_value_types[] = {
|
26
27
|
#define CPU_TIME_VALUE_POS 0
|
@@ -34,4 +35,4 @@ static const ddprof_ffi_ValueType enabled_value_types[] = {
|
|
34
35
|
#define ENABLED_VALUE_TYPES_COUNT (sizeof(enabled_value_types) / sizeof(ddprof_ffi_ValueType))
|
35
36
|
|
36
37
|
void record_sample(VALUE recorder_instance, ddprof_ffi_Sample sample);
|
37
|
-
|
38
|
+
VALUE enforce_recorder_instance(VALUE object);
|