ddtrace 1.3.0 → 1.4.1

Files changed (52)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +40 -2
  3. data/README.md +1 -1
  4. data/ext/ddtrace_profiling_loader/ddtrace_profiling_loader.c +10 -1
  5. data/ext/ddtrace_profiling_native_extension/collectors_cpu_and_wall_time.c +5 -4
  6. data/ext/ddtrace_profiling_native_extension/collectors_cpu_and_wall_time.h +1 -1
  7. data/ext/ddtrace_profiling_native_extension/collectors_cpu_and_wall_time_worker.c +391 -0
  8. data/ext/ddtrace_profiling_native_extension/extconf.rb +2 -0
  9. data/ext/ddtrace_profiling_native_extension/private_vm_api_access.c +9 -0
  10. data/ext/ddtrace_profiling_native_extension/profiling.c +2 -0
  11. data/ext/ddtrace_profiling_native_extension/stack_recorder.c +2 -1
  12. data/ext/ddtrace_profiling_native_extension/stack_recorder.h +8 -7
  13. data/lib/datadog/ci/contrib/cucumber/integration.rb +1 -1
  14. data/lib/datadog/ci/contrib/rspec/integration.rb +1 -1
  15. data/lib/datadog/core/configuration/base.rb +9 -0
  16. data/lib/datadog/core/configuration/components.rb +26 -6
  17. data/lib/datadog/core/configuration/settings.rb +25 -0
  18. data/lib/datadog/core/configuration.rb +4 -1
  19. data/lib/datadog/core/telemetry/client.rb +79 -0
  20. data/lib/datadog/core/telemetry/collector.rb +234 -0
  21. data/lib/datadog/core/telemetry/emitter.rb +48 -0
  22. data/lib/datadog/core/telemetry/event.rb +71 -0
  23. data/lib/datadog/core/telemetry/ext.rb +11 -0
  24. data/lib/datadog/core/telemetry/heartbeat.rb +37 -0
  25. data/lib/datadog/core/telemetry/http/adapters/net.rb +113 -0
  26. data/lib/datadog/core/telemetry/http/env.rb +20 -0
  27. data/lib/datadog/core/telemetry/http/ext.rb +20 -0
  28. data/lib/datadog/core/telemetry/http/response.rb +68 -0
  29. data/lib/datadog/core/telemetry/http/transport.rb +53 -0
  30. data/lib/datadog/core/telemetry/v1/app_event.rb +52 -0
  31. data/lib/datadog/core/telemetry/v1/application.rb +86 -0
  32. data/lib/datadog/core/telemetry/v1/configuration.rb +25 -0
  33. data/lib/datadog/core/telemetry/v1/dependency.rb +36 -0
  34. data/lib/datadog/core/telemetry/v1/host.rb +51 -0
  35. data/lib/datadog/core/telemetry/v1/integration.rb +58 -0
  36. data/lib/datadog/core/telemetry/v1/product.rb +28 -0
  37. data/lib/datadog/core/telemetry/v1/telemetry_request.rb +100 -0
  38. data/lib/datadog/core/utils/sequence.rb +5 -0
  39. data/lib/datadog/profiling/collectors/cpu_and_wall_time_worker.rb +74 -0
  40. data/lib/datadog/profiling/stack_recorder.rb +1 -1
  41. data/lib/datadog/profiling.rb +1 -0
  42. data/lib/datadog/tracing/contrib/extensions.rb +2 -0
  43. data/lib/datadog/tracing/contrib/grpc/datadog_interceptor/client.rb +9 -0
  44. data/lib/datadog/tracing/contrib/grpc/ext.rb +1 -0
  45. data/lib/datadog/tracing/contrib/patcher.rb +11 -0
  46. data/lib/datadog/tracing/contrib/rack/patcher.rb +8 -0
  47. data/lib/datadog/tracing/trace_operation.rb +1 -1
  48. data/lib/ddtrace/auto_instrument.rb +7 -0
  49. data/lib/ddtrace/transport/ext.rb +0 -1
  50. data/lib/ddtrace/transport/http/adapters/net.rb +1 -0
  51. data/lib/ddtrace/version.rb +2 -2
  52. metadata +26 -5
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: d512b6ba74cbbe3c0b879a8efd29a513680dfd1ce9864c785887717ababf7880
- data.tar.gz: 637d702bc35996f24d28904f20d77e4c6dece600c98eec6bbc47ba72ba7a96e2
+ metadata.gz: '01987691359b14541248048bb2f720031b4f0f0ca0be8fade0f6d0daf7aef8cc'
+ data.tar.gz: b2cce425177f3d619d0064d88ffb52073515b8d390e054117a160eac21e939d5
  SHA512:
- metadata.gz: bee0a447195f79d8a551f1c5076f283601c4ea14ec049f6dc077213ae578d0bdd5a0456f5b6d741a719d7543cc161b3da79970a5d696d0e2cd1e068166d9954d
- data.tar.gz: da1801a3e6cdf5b559bca925126e68add53293b0a581666302020fdeb6dea3a27e7a4d70ead568f24a9158184f957cd25e6c3ae3b3c267b4d467ff96114e3c8a
+ metadata.gz: 7232b588ee13d8a47a1fccbe39a8753a19a4e9c3b0b2dee2cb278a64b183500ec9859cfecffda8c128971bf88c93b148e6f915c2d7906139d1df050b173df42c
+ data.tar.gz: 2605640bffc586a0fd8bb38139d49fa6dad9f04bfe91a0e672cdd04a6c8a8a36896be5322a4b78ecab37175cd600bdc144109646712265b49bad4e103cc1ab5e
data/CHANGELOG.md CHANGED
@@ -2,6 +2,32 @@
 
  ## [Unreleased]
 
+ ## [1.4.1] - 2022-09-15
+
+ ### Fixed
+
+ * Missing distributed traces when trace is dropped by priority sampling ([#2101][], [#2279][])
+ * Profiling support when Ruby is compiled without a shared library ([#2250][])
+
+ ## [1.4.0] - 2022-08-25
+
+ Release notes: https://github.com/DataDog/dd-trace-rb/releases/tag/v1.4.0
+
+ Git diff: https://github.com/DataDog/dd-trace-rb/compare/v1.3.0...v1.4.0
+
+ ### Added
+
+ * gRPC: tag `grpc.client.deadline` ([#2200][])
+ * Implement telemetry, disabled by default ([#2153][])
+
+ ### Changed
+
+ * Bump `libdatadog` dependency version ([#2229][])
+
+ ### Fixed
+
+ * Fix CI instrumentation configuration ([#2219][])
+
  ## [1.3.0] - 2022-08-04
 
  Release notes: https://github.com/DataDog/dd-trace-rb/releases/tag/v1.3.0
@@ -2071,7 +2097,11 @@ Release notes: https://github.com/DataDog/dd-trace-rb/releases/tag/v0.3.1
 
  Git diff: https://github.com/DataDog/dd-trace-rb/compare/v0.3.0...v0.3.1
 
- [Unreleased]: https://github.com/DataDog/dd-trace-rb/compare/v1.1.0...master
+ [Unreleased]: https://github.com/DataDog/dd-trace-rb/compare/v1.4.1...master
+ [1.4.1]: https://github.com/DataDog/dd-trace-rb/compare/v1.4.0...v1.4.1
+ [1.4.0]: https://github.com/DataDog/dd-trace-rb/compare/v1.3.0...v1.4.0
+ [1.3.0]: https://github.com/DataDog/dd-trace-rb/compare/v1.2.0...v1.3.0
+ [1.2.0]: https://github.com/DataDog/dd-trace-rb/compare/v1.1.0...v1.2.0
  [1.1.0]: https://github.com/DataDog/dd-trace-rb/compare/v1.0.0...v1.1.0
  [1.0.0]: https://github.com/DataDog/dd-trace-rb/compare/v1.0.0.beta2...v1.0.0
  [1.0.0.beta2]: https://github.com/DataDog/dd-trace-rb/compare/v1.0.0.beta1...v1.0.0.beta2
@@ -2931,6 +2961,7 @@ Git diff: https://github.com/DataDog/dd-trace-rb/compare/v0.3.0...v0.3.1
  [#2082]: https://github.com/DataDog/dd-trace-rb/issues/2082
  [#2096]: https://github.com/DataDog/dd-trace-rb/issues/2096
  [#2097]: https://github.com/DataDog/dd-trace-rb/issues/2097
+ [#2101]: https://github.com/DataDog/dd-trace-rb/issues/2101
  [#2110]: https://github.com/DataDog/dd-trace-rb/issues/2110
  [#2113]: https://github.com/DataDog/dd-trace-rb/issues/2113
  [#2118]: https://github.com/DataDog/dd-trace-rb/issues/2118
@@ -2939,13 +2970,20 @@ Git diff: https://github.com/DataDog/dd-trace-rb/compare/v0.3.0...v0.3.1
  [#2138]: https://github.com/DataDog/dd-trace-rb/issues/2138
  [#2140]: https://github.com/DataDog/dd-trace-rb/issues/2140
  [#2150]: https://github.com/DataDog/dd-trace-rb/issues/2150
+ [#2153]: https://github.com/DataDog/dd-trace-rb/issues/2153
  [#2158]: https://github.com/DataDog/dd-trace-rb/issues/2158
  [#2162]: https://github.com/DataDog/dd-trace-rb/issues/2162
  [#2163]: https://github.com/DataDog/dd-trace-rb/issues/2163
+ [#2170]: https://github.com/DataDog/dd-trace-rb/issues/2170
  [#2173]: https://github.com/DataDog/dd-trace-rb/issues/2173
  [#2174]: https://github.com/DataDog/dd-trace-rb/issues/2174
  [#2180]: https://github.com/DataDog/dd-trace-rb/issues/2180
+ [#2200]: https://github.com/DataDog/dd-trace-rb/issues/2200
  [#2201]: https://github.com/DataDog/dd-trace-rb/issues/2201
+ [#2219]: https://github.com/DataDog/dd-trace-rb/issues/2219
+ [#2229]: https://github.com/DataDog/dd-trace-rb/issues/2229
+ [#2250]: https://github.com/DataDog/dd-trace-rb/issues/2250
+ [#2279]: https://github.com/DataDog/dd-trace-rb/issues/2279
  [@AdrianLC]: https://github.com/AdrianLC
  [@Azure7111]: https://github.com/Azure7111
  [@BabyGroot]: https://github.com/BabyGroot
@@ -3087,4 +3125,4 @@ Git diff: https://github.com/DataDog/dd-trace-rb/compare/v0.3.0...v0.3.1
  [@walterking]: https://github.com/walterking
  [@y-yagi]: https://github.com/y-yagi
  [@yukimurasawa]: https://github.com/yukimurasawa
- [@zachmccormick]: https://github.com/zachmccormick
+ [@zachmccormick]: https://github.com/zachmccormick
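Note on the "Implement telemetry, disabled by default" entry above: the telemetry client added under data/lib/datadog/core/telemetry/ ships turned off, so using it is an explicit opt-in. A minimal Ruby sketch of that opt-in follows, assuming the option added to core/configuration/settings.rb is exposed as `c.telemetry.enabled`; the exact setting name is an assumption, not something this diff confirms.

    # Hypothetical opt-in to the default-off telemetry added in 1.4.0.
    # Assumes the new settings.rb option is exposed as `telemetry.enabled`.
    require 'ddtrace'

    Datadog.configure do |c|
      c.telemetry.enabled = true # telemetry stays disabled unless explicitly enabled
    end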
data/README.md CHANGED
@@ -1,4 +1,4 @@
- ***Version 1.0.0 has recently been released. Check out our [upgrade guide](https://github.com/DataDog/dd-trace-rb/blob/master/docs/UpgradeGuide.md#from-0x-to-10) for more details.***
+ **We've recently released the 1.x version series. If you're upgrading from a 0.x version, check out our [upgrade guide](https://github.com/DataDog/dd-trace-rb/blob/master/docs/UpgradeGuide.md#from-0x-to-10).**
 
  # Datadog Trace Client
 
data/ext/ddtrace_profiling_loader/ddtrace_profiling_loader.c CHANGED
@@ -85,7 +85,16 @@ static bool failed_to_load(void *handle, VALUE *failure_details) {
  static bool incompatible_library(void *handle, VALUE *failure_details) {
  // The library being loaded may be linked to a different libruby than the current executing Ruby.
  // We check if this is the case by checking if a well-known symbol resolves to a common address.
- if (dlsym(handle, "ruby_xmalloc") != &ruby_xmalloc) {
+
+ void *xmalloc_from_library = dlsym(handle, "ruby_xmalloc");
+
+ if (xmalloc_from_library == NULL) {
+ // This happens when Ruby is built without a `libruby.so` by using `--disable-shared` at compilation time.
+ // In this situation, no conflict between libruby versions is possible.
+ return false;
+ }
+
+ if (xmalloc_from_library != &ruby_xmalloc) {
  *failure_details = rb_str_new_cstr("library was compiled and linked to a different Ruby version");
  unload_failed_library(handle);
  return true;
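The fix above covers Rubies built with `--disable-shared`: on such builds there is no separate `libruby.so`, so `dlsym(handle, "ruby_xmalloc")` returns NULL and no libruby version conflict is possible. As a side note, a small Ruby sketch (using the standard `RbConfig`, not part of this diff) shows how to tell which kind of build you are running on:

    # Prints whether this Ruby was built with a shared libruby.
    # On a --disable-shared build ENABLE_SHARED is "no", which is the case
    # the ddtrace_profiling_loader fix above now handles.
    require 'rbconfig'

    puts "ENABLE_SHARED: #{RbConfig::CONFIG['ENABLE_SHARED']}"
    puts "LIBRUBY_SO:    #{RbConfig::CONFIG['LIBRUBY_SO']}"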
data/ext/ddtrace_profiling_native_extension/collectors_cpu_and_wall_time.c CHANGED
@@ -153,8 +153,6 @@ static VALUE _native_new(VALUE klass) {
  }
 
  static VALUE _native_initialize(DDTRACE_UNUSED VALUE _self, VALUE collector_instance, VALUE recorder_instance, VALUE max_frames) {
- enforce_recorder_instance(recorder_instance);
-
  struct cpu_and_wall_time_collector_state *state;
  TypedData_Get_Struct(collector_instance, struct cpu_and_wall_time_collector_state, &cpu_and_wall_time_collector_typed_data, state);
 
@@ -164,7 +162,7 @@ static VALUE _native_initialize(DDTRACE_UNUSED VALUE _self, VALUE collector_inst
  // Update this when modifying state struct
  state->sampling_buffer = sampling_buffer_new(max_frames_requested);
  // hash_map_per_thread_context is already initialized, nothing to do here
- state->recorder_instance = recorder_instance;
+ state->recorder_instance = enforce_recorder_instance(recorder_instance);
 
  return Qtrue;
  }
@@ -180,6 +178,8 @@ static VALUE _native_sample(DDTRACE_UNUSED VALUE _self, VALUE collector_instance
  //
  // Assumption 1: This function is called in a thread that is holding the Global VM Lock. Caller is responsible for enforcing this.
  // Assumption 2: This function is allowed to raise exceptions. Caller is responsible for handling them, if needed.
+ // Assumption 3: This function IS NOT called from a signal handler. This function is not async-signal-safe.
+ // Assumption 4: This function IS NOT called in a reentrant way.
  VALUE cpu_and_wall_time_collector_sample(VALUE self_instance) {
  struct cpu_and_wall_time_collector_state *state;
  TypedData_Get_Struct(self_instance, struct cpu_and_wall_time_collector_state, &cpu_and_wall_time_collector_typed_data, state);
@@ -384,6 +384,7 @@ static long thread_id_for(VALUE thread) {
  return FIXNUM_P(object_id) ? FIX2LONG(object_id) : -1;
  }
 
- void enforce_cpu_and_wall_time_collector_instance(VALUE object) {
+ VALUE enforce_cpu_and_wall_time_collector_instance(VALUE object) {
  Check_TypedStruct(object, &cpu_and_wall_time_collector_typed_data);
+ return object;
  }
data/ext/ddtrace_profiling_native_extension/collectors_cpu_and_wall_time.h CHANGED
@@ -3,4 +3,4 @@
  #include <ruby.h>
 
  VALUE cpu_and_wall_time_collector_sample(VALUE self_instance);
- void enforce_cpu_and_wall_time_collector_instance(VALUE object);
+ VALUE enforce_cpu_and_wall_time_collector_instance(VALUE object);
data/ext/ddtrace_profiling_native_extension/collectors_cpu_and_wall_time_worker.c CHANGED
@@ -0,0 +1,391 @@
+ #include <ruby.h>
+ #include <ruby/thread.h>
+ #include <ruby/thread_native.h>
+ #include <ruby/debug.h>
+ #include <stdbool.h>
+ #include <signal.h>
+ #include "helpers.h"
+ #include "ruby_helpers.h"
+ #include "collectors_cpu_and_wall_time.h"
+ #include "private_vm_api_access.h"
+
+ // Used to trigger the periodic execution of Collectors::CpuAndWallTime, which implements all of the sampling logic
+ // itself; this class only implements the "doing it periodically" part.
+ //
+ // This file implements the native bits of the Datadog::Profiling::Collectors::CpuAndWallTimeWorker class
+
+ // ---
+ // Here be dragons: This component is quite fiddly and probably one of the more complex in the profiler as it deals with
+ // multiple threads, signal handlers, global state, etc.
+ //
+ // ## Design notes for this class:
+ //
+ // ### Constraints
+ //
+ // Currently, sampling Ruby threads requires calling Ruby VM APIs that are only safe to call while holding on to the
+ // global VM lock (and are not async-signal safe -- cannot be called from a signal handler).
+ //
+ // @ivoanjo: As a note, I don't think we should think of this constraint as set in stone. Since we can reach into the Ruby
+ // internals, we may be able to figure out a way of overcoming it. But it's definitely going to be hard so for now
+ // we're considering it as a given.
+ //
+ // ### Flow for triggering samples
+ //
+ // The flow for triggering samples is as follows:
+ //
+ // 1. Inside the `run_sampling_trigger_loop` function (running in the `CpuAndWallTimeWorker` background thread),
+ // a `SIGPROF` signal gets sent to the current process.
+ //
+ // 2. The `handle_sampling_signal` signal handler function gets called to handle the `SIGPROF` signal.
+ //
+ // Which thread the signal handler function gets called on by the operating system is quite important. We need to perform
+ // an operation -- calling the `rb_postponed_job_register_one` API -- that can only be called from the thread that
+ // is holding on to the global VM lock. So this is the thread we're "hoping" our signal lands on.
+ //
+ // The signal never lands on the `CpuAndWallTimeWorker` background thread because we explicitly block it off from that
+ // thread in `block_sigprof_signal_handler_from_running_in_current_thread`.
+ //
+ // If the signal lands on a thread that is not holding onto the global VM lock, we can't proceed to the next step,
+ // and we need to restart the sampling flow from step 1. (There are still quite a few improvements we can make here,
+ // but this is the current state of the implementation).
+ //
+ // 3. Inside `handle_sampling_signal`, if it's getting executed by the Ruby thread that is holding the global VM lock,
+ // we can call `rb_postponed_job_register_one` to ask the Ruby VM to call our `sample_from_postponed_job` function
+ // "as soon as it can".
+ //
+ // 4. The Ruby VM calls our `sample_from_postponed_job` from a thread holding the global VM lock. A sample is recorded by
+ // calling `cpu_and_wall_time_collector_sample`.
+ //
+ // ---
+
+ // Contains state for a single CpuAndWallTimeWorker instance
+ struct cpu_and_wall_time_worker_state {
+ // Important: This is not atomic nor is it guaranteed to replace memory barriers and the like. Aka this works for
+ // telling the sampling trigger loop to stop, but if we ever need to communicate more, we should move to actual
+ // atomic operations. stdatomic.h seems a nice thing to reach out for.
+ volatile bool should_run;
+
+ VALUE cpu_and_wall_time_collector_instance;
+ // When something goes wrong during sampling, we record the Ruby exception here, so that it can be "re-raised" on
+ // the CpuAndWallTimeWorker thread
+ VALUE failure_exception;
+ };
+
+ static VALUE _native_new(VALUE klass);
+ static VALUE _native_initialize(DDTRACE_UNUSED VALUE _self, VALUE self_instance, VALUE cpu_and_wall_time_collector_instance);
+ static void cpu_and_wall_time_worker_typed_data_mark(void *state_ptr);
+ static VALUE _native_sampling_loop(VALUE self, VALUE instance);
+ static VALUE _native_stop(DDTRACE_UNUSED VALUE _self, VALUE self_instance);
+ static void install_sigprof_signal_handler(void (*signal_handler_function)(int, siginfo_t *, void *));
+ static void remove_sigprof_signal_handler(void);
+ static void block_sigprof_signal_handler_from_running_in_current_thread(void);
+ static void handle_sampling_signal(DDTRACE_UNUSED int _signal, DDTRACE_UNUSED siginfo_t *_info, DDTRACE_UNUSED void *_ucontext);
+ static void *run_sampling_trigger_loop(void *state_ptr);
+ static void interrupt_sampling_trigger_loop(void *state_ptr);
+ static void sample_from_postponed_job(DDTRACE_UNUSED void *_unused);
+ static VALUE handle_sampling_failure(VALUE self_instance, VALUE exception);
+ static VALUE _native_current_sigprof_signal_handler(DDTRACE_UNUSED VALUE self);
+ static VALUE release_gvl_and_run_sampling_trigger_loop(VALUE instance);
+ static VALUE _native_is_running(DDTRACE_UNUSED VALUE self, VALUE instance);
+ static void testing_signal_handler(DDTRACE_UNUSED int _signal, DDTRACE_UNUSED siginfo_t *_info, DDTRACE_UNUSED void *_ucontext);
+ static VALUE _native_install_testing_signal_handler(DDTRACE_UNUSED VALUE self);
+ static VALUE _native_remove_testing_signal_handler(DDTRACE_UNUSED VALUE self);
+
+ // Global state -- be very careful when accessing or modifying it
+
+ // Note: Global state must only be mutated while holding the global VM lock (we piggy back on it to ensure correctness).
+ // The active_sampler_instance needs to be global because we access it from the signal handler.
+ static VALUE active_sampler_instance = Qnil;
+ // ...We also store active_sampler_owner_thread to be able to tell who the active_sampler_instance belongs to (and also
+ // to detect when it is outdated)
+ static VALUE active_sampler_owner_thread = Qnil;
+
+ void collectors_cpu_and_wall_time_worker_init(VALUE profiling_module) {
+ rb_global_variable(&active_sampler_instance);
+ rb_global_variable(&active_sampler_owner_thread);
+
+ VALUE collectors_module = rb_define_module_under(profiling_module, "Collectors");
+ VALUE collectors_cpu_and_wall_time_worker_class = rb_define_class_under(collectors_module, "CpuAndWallTimeWorker", rb_cObject);
+ // Hosts methods used for testing the native code using RSpec
+ VALUE testing_module = rb_define_module_under(collectors_cpu_and_wall_time_worker_class, "Testing");
+
+ // Instances of the CpuAndWallTimeWorker class are "TypedData" objects.
+ // "TypedData" objects are special objects in the Ruby VM that can wrap C structs.
+ // In this case, it wraps the cpu_and_wall_time_worker_state.
+ //
+ // Because Ruby doesn't know how to initialize native-level structs, we MUST override the allocation function for objects
+ // of this class so that we can manage this part. Not overriding or disabling the allocation function is a common
+ // gotcha for "TypedData" objects that can very easily lead to VM crashes, see for instance
+ // https://bugs.ruby-lang.org/issues/18007 for a discussion around this.
+ rb_define_alloc_func(collectors_cpu_and_wall_time_worker_class, _native_new);
+
+ rb_define_singleton_method(collectors_cpu_and_wall_time_worker_class, "_native_initialize", _native_initialize, 2);
+ rb_define_singleton_method(collectors_cpu_and_wall_time_worker_class, "_native_sampling_loop", _native_sampling_loop, 1);
+ rb_define_singleton_method(collectors_cpu_and_wall_time_worker_class, "_native_stop", _native_stop, 1);
+ rb_define_singleton_method(testing_module, "_native_current_sigprof_signal_handler", _native_current_sigprof_signal_handler, 0);
+ rb_define_singleton_method(testing_module, "_native_is_running?", _native_is_running, 1);
+ rb_define_singleton_method(testing_module, "_native_install_testing_signal_handler", _native_install_testing_signal_handler, 0);
+ rb_define_singleton_method(testing_module, "_native_remove_testing_signal_handler", _native_remove_testing_signal_handler, 0);
+ }
+
+ // This structure is used to define a Ruby object that stores a pointer to a struct cpu_and_wall_time_worker_state
+ // See also https://github.com/ruby/ruby/blob/master/doc/extension.rdoc for how this works
+ static const rb_data_type_t cpu_and_wall_time_worker_typed_data = {
+ .wrap_struct_name = "Datadog::Profiling::Collectors::CpuAndWallTimeWorker",
+ .function = {
+ .dmark = cpu_and_wall_time_worker_typed_data_mark,
+ .dfree = RUBY_DEFAULT_FREE,
+ .dsize = NULL, // We don't track profile memory usage (although it'd be cool if we did!)
+ //.dcompact = NULL, // FIXME: Add support for compaction
+ },
+ .flags = RUBY_TYPED_FREE_IMMEDIATELY
+ };
+
+ static VALUE _native_new(VALUE klass) {
+ struct cpu_and_wall_time_worker_state *state = ruby_xcalloc(1, sizeof(struct cpu_and_wall_time_worker_state));
+
+ state->should_run = false;
+ state->cpu_and_wall_time_collector_instance = Qnil;
+ state->failure_exception = Qnil;
+
+ return TypedData_Wrap_Struct(klass, &cpu_and_wall_time_worker_typed_data, state);
+ }
+
+ static VALUE _native_initialize(DDTRACE_UNUSED VALUE _self, VALUE self_instance, VALUE cpu_and_wall_time_collector_instance) {
+ struct cpu_and_wall_time_worker_state *state;
+ TypedData_Get_Struct(self_instance, struct cpu_and_wall_time_worker_state, &cpu_and_wall_time_worker_typed_data, state);
+
+ state->cpu_and_wall_time_collector_instance = enforce_cpu_and_wall_time_collector_instance(cpu_and_wall_time_collector_instance);
+
+ return Qtrue;
+ }
+
+ // Since our state contains references to Ruby objects, we need to tell the Ruby GC about them
+ static void cpu_and_wall_time_worker_typed_data_mark(void *state_ptr) {
+ struct cpu_and_wall_time_worker_state *state = (struct cpu_and_wall_time_worker_state *) state_ptr;
+
+ rb_gc_mark(state->cpu_and_wall_time_collector_instance);
+ rb_gc_mark(state->failure_exception);
+ }
+
+ // Called in a background thread created in CpuAndWallTimeWorker#start
+ static VALUE _native_sampling_loop(DDTRACE_UNUSED VALUE _self, VALUE instance) {
+ struct cpu_and_wall_time_worker_state *state;
+ TypedData_Get_Struct(instance, struct cpu_and_wall_time_worker_state, &cpu_and_wall_time_worker_typed_data, state);
+
+ if (active_sampler_owner_thread != Qnil && is_thread_alive(active_sampler_owner_thread)) {
+ rb_raise(
+ rb_eRuntimeError,
+ "Could not start CpuAndWallTimeWorker: There's already another instance of CpuAndWallTimeWorker active in a different thread"
+ );
+ }
+
+ // This write to a global is thread-safe BECAUSE we're still holding on to the global VM lock at this point
+ active_sampler_instance = instance;
+ active_sampler_owner_thread = rb_thread_current();
+
+ state->should_run = true;
+
+ block_sigprof_signal_handler_from_running_in_current_thread(); // We want to interrupt the thread with the global VM lock, never this one
+
+ install_sigprof_signal_handler(handle_sampling_signal);
+
+ // Release GVL, get to the actual work!
+ int exception_state;
+ rb_protect(release_gvl_and_run_sampling_trigger_loop, instance, &exception_state);
+
+ // The sample trigger loop finished (either cleanly or with an error); let's clean up
+
+ remove_sigprof_signal_handler();
+ active_sampler_instance = Qnil;
+ active_sampler_owner_thread = Qnil;
+
+ // Ensure that instance is not garbage collected while the native sampling loop is running; this is probably not needed, but just in case
+ RB_GC_GUARD(instance);
+
+ if (exception_state) rb_jump_tag(exception_state); // Re-raise any exception that happened
+
+ return Qnil;
+ }
+
+ static VALUE _native_stop(DDTRACE_UNUSED VALUE _self, VALUE self_instance) {
+ struct cpu_and_wall_time_worker_state *state;
+ TypedData_Get_Struct(self_instance, struct cpu_and_wall_time_worker_state, &cpu_and_wall_time_worker_typed_data, state);
+
+ state->should_run = false;
+
+ return Qtrue;
+ }
+
+ static void install_sigprof_signal_handler(void (*signal_handler_function)(int, siginfo_t *, void *)) {
+ struct sigaction existing_signal_handler_config = {.sa_sigaction = NULL};
+ struct sigaction signal_handler_config = {
+ .sa_flags = SA_RESTART | SA_SIGINFO,
+ .sa_sigaction = signal_handler_function
+ };
+ sigemptyset(&signal_handler_config.sa_mask);
+
+ if (sigaction(SIGPROF, &signal_handler_config, &existing_signal_handler_config) != 0) {
+ rb_sys_fail("Could not start CpuAndWallTimeWorker: Could not install signal handler");
+ }
+
+ // In some corner cases (e.g. after a fork), our signal handler may still be around, and that's ok
+ if (existing_signal_handler_config.sa_sigaction == handle_sampling_signal) return;
+
+ if (existing_signal_handler_config.sa_handler != NULL || existing_signal_handler_config.sa_sigaction != NULL) {
+ // A previous signal handler already existed. Currently we don't support this situation, so let's just back out
+ // of the installation.
+
+ if (sigaction(SIGPROF, &existing_signal_handler_config, NULL) != 0) {
+ rb_sys_fail(
+ "Could not start CpuAndWallTimeWorker: Could not re-install pre-existing SIGPROF signal handler. " \
+ "This may break the component that had installed it."
+ );
+ }
+
+ rb_raise(rb_eRuntimeError, "Could not start CpuAndWallTimeWorker: There's a pre-existing SIGPROF signal handler");
+ }
+ }
+
+ static void remove_sigprof_signal_handler(void) {
+ struct sigaction signal_handler_config = {
+ .sa_handler = SIG_DFL, // Reset back to default
+ .sa_flags = SA_RESTART // TODO: Unclear if this is actually needed/does anything at all
+ };
+ sigemptyset(&signal_handler_config.sa_mask);
+
+ if (sigaction(SIGPROF, &signal_handler_config, NULL) != 0) rb_sys_fail("Failure while removing the signal handler");
+ }
+
+ static void block_sigprof_signal_handler_from_running_in_current_thread(void) {
+ sigset_t signals_to_block;
+ sigemptyset(&signals_to_block);
+ sigaddset(&signals_to_block, SIGPROF);
+ pthread_sigmask(SIG_BLOCK, &signals_to_block, NULL);
+ }
+
+ static void handle_sampling_signal(DDTRACE_UNUSED int _signal, DDTRACE_UNUSED siginfo_t *_info, DDTRACE_UNUSED void *_ucontext) {
+ if (!ruby_thread_has_gvl_p()) {
+ return; // Not safe to enqueue a sample from this thread
+ }
+
+ // We implicitly assume there can be no concurrent nor nested calls to handle_sampling_signal because
+ // a) we get triggered using SIGPROF, and the docs state second SIGPROF will not interrupt an existing one
+ // b) we validate we are in the thread that has the global VM lock; if a different thread gets a signal, it will return early
+ // because it will not have the global VM lock
+ // TODO: Validate that this does not impact Ractors
+
+ // Note: rb_postponed_job_register_one ensures that if there's a previous sample_from_postponed_job queued for execution
+ // then we will not queue a second one. It does this by doing a linear scan on the existing jobs; in the future we
+ // may want to implement that check ourselves.
+
+ // TODO: Do something with result (potentially update tracking counters?)
+ /*int result =*/ rb_postponed_job_register_one(0, sample_from_postponed_job, NULL);
+ }
+
+ // The actual sampling trigger loop always runs **without** the global vm lock.
+ static void *run_sampling_trigger_loop(void *state_ptr) {
+ struct cpu_and_wall_time_worker_state *state = (struct cpu_and_wall_time_worker_state *) state_ptr;
+
+ struct timespec time_between_signals = {.tv_nsec = 10 * 1000 * 1000 /* 10ms */};
+
+ while (state->should_run) {
+ // TODO: This is still a placeholder for a more complex mechanism. In particular:
+ // * We want to signal a particular thread or threads, not the process in general
+ // * We want to track if a signal landed on the thread holding the global VM lock and do something about it
+ // * We want to do more than having a fixed sampling rate
+
+ kill(getpid(), SIGPROF);
+ nanosleep(&time_between_signals, NULL);
+ }
+
+ return NULL; // Unused
+ }
+
+ // This is called by the Ruby VM when it wants to shut down the background thread
+ static void interrupt_sampling_trigger_loop(void *state_ptr) {
+ struct cpu_and_wall_time_worker_state *state = (struct cpu_and_wall_time_worker_state *) state_ptr;
+
+ state->should_run = false;
+ }
+
+ static void sample_from_postponed_job(DDTRACE_UNUSED void *_unused) {
+ VALUE instance = active_sampler_instance; // Read from global variable
+
+ // This can potentially happen if the CpuAndWallTimeWorker was stopped while the postponed job was waiting to be executed; nothing to do
+ if (instance == Qnil) return;
+
+ struct cpu_and_wall_time_worker_state *state;
+ TypedData_Get_Struct(instance, struct cpu_and_wall_time_worker_state, &cpu_and_wall_time_worker_typed_data, state);
+
+ // Trigger sampling using the Collectors::CpuAndWallTime; rescue against any exceptions that happen during sampling
+ VALUE (*function_to_call_safely)(VALUE) = cpu_and_wall_time_collector_sample;
+ VALUE function_to_call_safely_arg = state->cpu_and_wall_time_collector_instance;
+ VALUE (*exception_handler_function)(VALUE, VALUE) = handle_sampling_failure;
+ VALUE exception_handler_function_arg = instance;
+ rb_rescue2(
+ function_to_call_safely,
+ function_to_call_safely_arg,
+ exception_handler_function,
+ exception_handler_function_arg,
+ rb_eException, // rb_eException is the base class of all Ruby exceptions
+ 0 // Required by API to be the last argument
+ );
+ }
+
+ static VALUE handle_sampling_failure(VALUE self_instance, VALUE exception) {
+ struct cpu_and_wall_time_worker_state *state;
+ TypedData_Get_Struct(self_instance, struct cpu_and_wall_time_worker_state, &cpu_and_wall_time_worker_typed_data, state);
+
+ state->should_run = false;
+ state->failure_exception = exception;
+
+ return Qnil;
+ }
+
+ static VALUE _native_current_sigprof_signal_handler(DDTRACE_UNUSED VALUE self) {
+ struct sigaction existing_signal_handler_config = {.sa_sigaction = NULL};
+ if (sigaction(SIGPROF, NULL, &existing_signal_handler_config) != 0) {
+ rb_sys_fail("Failed to probe existing handler");
+ }
+
+ if (existing_signal_handler_config.sa_sigaction == handle_sampling_signal) {
+ return ID2SYM(rb_intern("profiling"));
+ } else if (existing_signal_handler_config.sa_sigaction != NULL) {
+ return ID2SYM(rb_intern("other"));
+ } else {
+ return Qnil;
+ }
+ }
+
+ static VALUE release_gvl_and_run_sampling_trigger_loop(VALUE instance) {
+ struct cpu_and_wall_time_worker_state *state;
+ TypedData_Get_Struct(instance, struct cpu_and_wall_time_worker_state, &cpu_and_wall_time_worker_typed_data, state);
+
+ rb_thread_call_without_gvl(run_sampling_trigger_loop, state, interrupt_sampling_trigger_loop, state);
+
+ // If we stopped sampling due to an exception, re-raise it (now in the worker thread)
+ if (state->failure_exception != Qnil) rb_exc_raise(state->failure_exception);
+
+ return Qnil;
+ }
+
+ static VALUE _native_is_running(DDTRACE_UNUSED VALUE self, VALUE instance) {
+ return \
+ (active_sampler_owner_thread != Qnil && is_thread_alive(active_sampler_owner_thread) && active_sampler_instance == instance) ?
+ Qtrue : Qfalse;
+ }
+
+ static void testing_signal_handler(DDTRACE_UNUSED int _signal, DDTRACE_UNUSED siginfo_t *_info, DDTRACE_UNUSED void *_ucontext) {
+ /* Does nothing on purpose */
+ }
+
+ static VALUE _native_install_testing_signal_handler(DDTRACE_UNUSED VALUE self) {
+ install_sigprof_signal_handler(testing_signal_handler);
+ return Qtrue;
+ }
+
+ static VALUE _native_remove_testing_signal_handler(DDTRACE_UNUSED VALUE self) {
+ remove_sigprof_signal_handler();
+ return Qtrue;
+ }
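The design notes at the top of this new file describe the intended driving pattern: a Ruby-side Collectors::CpuAndWallTimeWorker starts a background thread, hands control to `_native_sampling_loop`, and later stops it via `_native_stop`. For illustration only, here is a minimal Ruby sketch of such a wrapper built on the singleton methods registered above; it is not the gem's actual lib/datadog/profiling/collectors/cpu_and_wall_time_worker.rb, it assumes the native extension is compiled and loaded, and the `start`/`stop` method names are hypothetical:

    # Illustrative sketch only; the real CpuAndWallTimeWorker Ruby class differs.
    module Datadog
      module Profiling
        module Collectors
          class CpuAndWallTimeWorker
            # Wires this instance to the collector that performs the actual sampling
            def start(cpu_and_wall_time_collector)
              self.class._native_initialize(self, cpu_and_wall_time_collector)
              # The native loop sends SIGPROF to the process roughly every 10ms until stopped
              @worker_thread = Thread.new { self.class._native_sampling_loop(self) }
            end

            def stop
              self.class._native_stop(self) # flips should_run so the trigger loop exits
              @worker_thread&.join
              @worker_thread = nil
            end
          end
        end
      end
    end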
data/ext/ddtrace_profiling_native_extension/extconf.rb CHANGED
@@ -144,6 +144,8 @@ if RUBY_VERSION < '2.3'
  $defs << '-DUSE_LEGACY_RB_PROFILE_FRAMES'
  # ... you couldn't name threads
  $defs << '-DNO_THREAD_NAMES'
+ # ...the ruby_thread_has_gvl_p function was not exposed to users outside of the VM
+ $defs << '-DNO_THREAD_HAS_GVL'
  end
 
  # If we got here, libdatadog is available and loaded
data/ext/ddtrace_profiling_native_extension/private_vm_api_access.c CHANGED
@@ -681,3 +681,12 @@ int ddtrace_rb_profile_frames(VALUE thread, int start, int limit, VALUE *buff, i
  }
 
  #endif // USE_LEGACY_RB_PROFILE_FRAMES
+
+ #ifdef NO_THREAD_HAS_GVL
+ int ruby_thread_has_gvl_p(void) {
+ // TODO: The CpuAndWallTimeWorker needs this function, but Ruby 2.2 doesn't expose it... For now this placeholder
+ // will enable the profiling native extension to continue to compile on Ruby 2.2, but the CpuAndWallTimeWorker will
+ // not work properly on 2.2. Will be addressed later.
+ return 0;
+ }
+ #endif // NO_THREAD_HAS_GVL
data/ext/ddtrace_profiling_native_extension/profiling.c CHANGED
@@ -5,6 +5,7 @@
 
  // Each class/module here is implemented in their separate file
  void collectors_cpu_and_wall_time_init(VALUE profiling_module);
+ void collectors_cpu_and_wall_time_worker_init(VALUE profiling_module);
  void collectors_stack_init(VALUE profiling_module);
  void http_transport_init(VALUE profiling_module);
  void stack_recorder_init(VALUE profiling_module);
@@ -22,6 +23,7 @@ void DDTRACE_EXPORT Init_ddtrace_profiling_native_extension(void) {
  rb_define_singleton_method(native_extension_module, "clock_id_for", clock_id_for, 1); // from clock_id.h
 
  collectors_cpu_and_wall_time_init(profiling_module);
+ collectors_cpu_and_wall_time_worker_init(profiling_module);
  collectors_stack_init(profiling_module);
  http_transport_init(profiling_module);
  stack_recorder_init(profiling_module);
data/ext/ddtrace_profiling_native_extension/stack_recorder.c CHANGED
@@ -324,8 +324,9 @@ static void *call_serialize_without_gvl(void *call_args) {
  return NULL; // Unused
  }
 
- void enforce_recorder_instance(VALUE object) {
+ VALUE enforce_recorder_instance(VALUE object) {
  Check_TypedStruct(object, &stack_recorder_typed_data);
+ return object;
  }
 
  static struct active_slot_pair sampler_lock_active_profile(struct stack_recorder_state *state) {
data/ext/ddtrace_profiling_native_extension/stack_recorder.h CHANGED
@@ -15,12 +15,13 @@
  // ```
  #define VALUE_STRING(string) {.ptr = "" string, .len = sizeof(string) - 1}
 
- #define CPU_TIME_VALUE {.type_ = VALUE_STRING("cpu-time"), .unit = VALUE_STRING("nanoseconds")}
- #define CPU_SAMPLES_VALUE {.type_ = VALUE_STRING("cpu-samples"), .unit = VALUE_STRING("count")}
- #define WALL_TIME_VALUE {.type_ = VALUE_STRING("wall-time"), .unit = VALUE_STRING("nanoseconds")}
- #define ALLOC_SAMPLES_VALUE {.type_ = VALUE_STRING("alloc-samples"), .unit = VALUE_STRING("count")}
- #define ALLOC_SPACE_VALUE {.type_ = VALUE_STRING("alloc-space"), .unit = VALUE_STRING("bytes")}
- #define HEAP_SPACE_VALUE {.type_ = VALUE_STRING("heap-space"), .unit = VALUE_STRING("bytes")}
+ #define CPU_TIME_VALUE {.type_ = VALUE_STRING("cpu-time"), .unit = VALUE_STRING("nanoseconds")}
+ #define CPU_SAMPLES_VALUE {.type_ = VALUE_STRING("cpu-samples"), .unit = VALUE_STRING("count")}
+ #define WALL_TIME_VALUE {.type_ = VALUE_STRING("wall-time"), .unit = VALUE_STRING("nanoseconds")}
+ #define ALLOC_SIZE_VALUE {.type_ = VALUE_STRING("alloc-size"), .unit = VALUE_STRING("bytes")}
+ #define ALLOC_SAMPLES_VALUE {.type_ = VALUE_STRING("alloc-samples"), .unit = VALUE_STRING("count")}
+ #define HEAP_LIVE_SIZE_VALUE {.type_ = VALUE_STRING("heap-live-size"), .unit = VALUE_STRING("bytes")}
+ #define HEAP_LIVE_SAMPLES_VALUE {.type_ = VALUE_STRING("heap-live-samples"), .unit = VALUE_STRING("count")}
 
  static const ddprof_ffi_ValueType enabled_value_types[] = {
  #define CPU_TIME_VALUE_POS 0
@@ -34,4 +35,4 @@ static const ddprof_ffi_ValueType enabled_value_types[] = {
  #define ENABLED_VALUE_TYPES_COUNT (sizeof(enabled_value_types) / sizeof(ddprof_ffi_ValueType))
 
  void record_sample(VALUE recorder_instance, ddprof_ffi_Sample sample);
- void enforce_recorder_instance(VALUE object);
+ VALUE enforce_recorder_instance(VALUE object);
data/lib/datadog/ci/contrib/cucumber/integration.rb CHANGED
@@ -35,7 +35,7 @@ module Datadog
  false
  end
 
- def default_configuration
+ def new_configuration
  Configuration::Settings.new
  end
 
data/lib/datadog/ci/contrib/rspec/integration.rb CHANGED
@@ -36,7 +36,7 @@ module Datadog
  false
  end
 
- def default_configuration
+ def new_configuration
  Configuration::Settings.new
  end
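The `new_configuration` renames in the cucumber and rspec integrations above belong to the CI instrumentation configuration fix (#2219). For context, a hedged sketch of how CI visibility for these test frameworks is typically switched on in the 1.x series; the `c.ci.enabled` / `c.ci.instrument` option names follow the public ddtrace docs and are assumptions rather than something this diff verifies:

    # Hypothetical usage sketch: enabling CI visibility for RSpec and Cucumber,
    # the two integrations whose settings objects are built by new_configuration.
    require 'ddtrace'

    Datadog.configure do |c|
      c.ci.enabled = true          # turn on test-level tracing (CI visibility)
      c.service = 'my-test-suite'
      c.ci.instrument :rspec
      c.ci.instrument :cucumber
    end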