ddtrace 1.3.0 → 1.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +40 -2
  3. data/README.md +1 -1
  4. data/ext/ddtrace_profiling_loader/ddtrace_profiling_loader.c +10 -1
  5. data/ext/ddtrace_profiling_native_extension/collectors_cpu_and_wall_time.c +5 -4
  6. data/ext/ddtrace_profiling_native_extension/collectors_cpu_and_wall_time.h +1 -1
  7. data/ext/ddtrace_profiling_native_extension/collectors_cpu_and_wall_time_worker.c +391 -0
  8. data/ext/ddtrace_profiling_native_extension/extconf.rb +2 -0
  9. data/ext/ddtrace_profiling_native_extension/private_vm_api_access.c +9 -0
  10. data/ext/ddtrace_profiling_native_extension/profiling.c +2 -0
  11. data/ext/ddtrace_profiling_native_extension/stack_recorder.c +2 -1
  12. data/ext/ddtrace_profiling_native_extension/stack_recorder.h +8 -7
  13. data/lib/datadog/ci/contrib/cucumber/integration.rb +1 -1
  14. data/lib/datadog/ci/contrib/rspec/integration.rb +1 -1
  15. data/lib/datadog/core/configuration/base.rb +9 -0
  16. data/lib/datadog/core/configuration/components.rb +26 -6
  17. data/lib/datadog/core/configuration/settings.rb +25 -0
  18. data/lib/datadog/core/configuration.rb +4 -1
  19. data/lib/datadog/core/telemetry/client.rb +79 -0
  20. data/lib/datadog/core/telemetry/collector.rb +234 -0
  21. data/lib/datadog/core/telemetry/emitter.rb +48 -0
  22. data/lib/datadog/core/telemetry/event.rb +71 -0
  23. data/lib/datadog/core/telemetry/ext.rb +11 -0
  24. data/lib/datadog/core/telemetry/heartbeat.rb +37 -0
  25. data/lib/datadog/core/telemetry/http/adapters/net.rb +113 -0
  26. data/lib/datadog/core/telemetry/http/env.rb +20 -0
  27. data/lib/datadog/core/telemetry/http/ext.rb +20 -0
  28. data/lib/datadog/core/telemetry/http/response.rb +68 -0
  29. data/lib/datadog/core/telemetry/http/transport.rb +53 -0
  30. data/lib/datadog/core/telemetry/v1/app_event.rb +52 -0
  31. data/lib/datadog/core/telemetry/v1/application.rb +86 -0
  32. data/lib/datadog/core/telemetry/v1/configuration.rb +25 -0
  33. data/lib/datadog/core/telemetry/v1/dependency.rb +36 -0
  34. data/lib/datadog/core/telemetry/v1/host.rb +51 -0
  35. data/lib/datadog/core/telemetry/v1/integration.rb +58 -0
  36. data/lib/datadog/core/telemetry/v1/product.rb +28 -0
  37. data/lib/datadog/core/telemetry/v1/telemetry_request.rb +100 -0
  38. data/lib/datadog/core/utils/sequence.rb +5 -0
  39. data/lib/datadog/profiling/collectors/cpu_and_wall_time_worker.rb +74 -0
  40. data/lib/datadog/profiling/stack_recorder.rb +1 -1
  41. data/lib/datadog/profiling.rb +1 -0
  42. data/lib/datadog/tracing/contrib/extensions.rb +2 -0
  43. data/lib/datadog/tracing/contrib/grpc/datadog_interceptor/client.rb +9 -0
  44. data/lib/datadog/tracing/contrib/grpc/ext.rb +1 -0
  45. data/lib/datadog/tracing/contrib/patcher.rb +11 -0
  46. data/lib/datadog/tracing/contrib/rack/patcher.rb +8 -0
  47. data/lib/datadog/tracing/trace_operation.rb +1 -1
  48. data/lib/ddtrace/auto_instrument.rb +7 -0
  49. data/lib/ddtrace/transport/ext.rb +0 -1
  50. data/lib/ddtrace/transport/http/adapters/net.rb +1 -0
  51. data/lib/ddtrace/version.rb +2 -2
  52. metadata +26 -5
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: d512b6ba74cbbe3c0b879a8efd29a513680dfd1ce9864c785887717ababf7880
4
- data.tar.gz: 637d702bc35996f24d28904f20d77e4c6dece600c98eec6bbc47ba72ba7a96e2
3
+ metadata.gz: '01987691359b14541248048bb2f720031b4f0f0ca0be8fade0f6d0daf7aef8cc'
4
+ data.tar.gz: b2cce425177f3d619d0064d88ffb52073515b8d390e054117a160eac21e939d5
5
5
  SHA512:
6
- metadata.gz: bee0a447195f79d8a551f1c5076f283601c4ea14ec049f6dc077213ae578d0bdd5a0456f5b6d741a719d7543cc161b3da79970a5d696d0e2cd1e068166d9954d
7
- data.tar.gz: da1801a3e6cdf5b559bca925126e68add53293b0a581666302020fdeb6dea3a27e7a4d70ead568f24a9158184f957cd25e6c3ae3b3c267b4d467ff96114e3c8a
6
+ metadata.gz: 7232b588ee13d8a47a1fccbe39a8753a19a4e9c3b0b2dee2cb278a64b183500ec9859cfecffda8c128971bf88c93b148e6f915c2d7906139d1df050b173df42c
7
+ data.tar.gz: 2605640bffc586a0fd8bb38139d49fa6dad9f04bfe91a0e672cdd04a6c8a8a36896be5322a4b78ecab37175cd600bdc144109646712265b49bad4e103cc1ab5e
data/CHANGELOG.md CHANGED
@@ -2,6 +2,32 @@
2
2
 
3
3
  ## [Unreleased]
4
4
 
5
+ ## [1.4.1] - 2022-09-15
6
+
7
+ ### Fixed
8
+
9
+ * Missing distributed traces when trace is dropped by priority sampling ([#2101][], [#2279][])
10
+ * Profiling support when Ruby is compiled without a shared library ([#2250][])
11
+
12
+ ## [1.4.0] - 2022-08-25
13
+
14
+ Release notes: https://github.com/DataDog/dd-trace-rb/releases/tag/v1.4.0
15
+
16
+ Git diff: https://github.com/DataDog/dd-trace-rb/compare/v1.3.0...v1.4.0
17
+
18
+ ### Added
19
+
20
+ * gRPC: tag `grpc.client.deadline` ([#2200][])
21
+ * Implement telemetry, disabled by default ([#2153][])
22
+
23
+ ### Changed
24
+
25
+ * Bump `libdatadog` dependency version ([#2229][])
26
+
27
+ ### Fixed
28
+
29
+ * Fix CI instrumentation configuration ([#2219][])
30
+
5
31
  ## [1.3.0] - 2022-08-04
6
32
 
7
33
  Release notes: https://github.com/DataDog/dd-trace-rb/releases/tag/v1.3.0
@@ -2071,7 +2097,11 @@ Release notes: https://github.com/DataDog/dd-trace-rb/releases/tag/v0.3.1
2071
2097
 
2072
2098
  Git diff: https://github.com/DataDog/dd-trace-rb/compare/v0.3.0...v0.3.1
2073
2099
 
2074
- [Unreleased]: https://github.com/DataDog/dd-trace-rb/compare/v1.1.0...master
2100
+ [Unreleased]: https://github.com/DataDog/dd-trace-rb/compare/v1.4.1...master
2101
+ [1.4.1]: https://github.com/DataDog/dd-trace-rb/compare/v1.4.0...v1.4.1
2102
+ [1.4.0]: https://github.com/DataDog/dd-trace-rb/compare/v1.3.0...v1.4.0
2103
+ [1.3.0]: https://github.com/DataDog/dd-trace-rb/compare/v1.2.0...v1.3.0
2104
+ [1.2.0]: https://github.com/DataDog/dd-trace-rb/compare/v1.1.0...v1.2.0
2075
2105
  [1.1.0]: https://github.com/DataDog/dd-trace-rb/compare/v1.0.0...v1.1.0
2076
2106
  [1.0.0]: https://github.com/DataDog/dd-trace-rb/compare/v1.0.0.beta2...v1.0.0
2077
2107
  [1.0.0.beta2]: https://github.com/DataDog/dd-trace-rb/compare/v1.0.0.beta1...v1.0.0.beta2
@@ -2931,6 +2961,7 @@ Git diff: https://github.com/DataDog/dd-trace-rb/compare/v0.3.0...v0.3.1
2931
2961
  [#2082]: https://github.com/DataDog/dd-trace-rb/issues/2082
2932
2962
  [#2096]: https://github.com/DataDog/dd-trace-rb/issues/2096
2933
2963
  [#2097]: https://github.com/DataDog/dd-trace-rb/issues/2097
2964
+ [#2101]: https://github.com/DataDog/dd-trace-rb/issues/2101
2934
2965
  [#2110]: https://github.com/DataDog/dd-trace-rb/issues/2110
2935
2966
  [#2113]: https://github.com/DataDog/dd-trace-rb/issues/2113
2936
2967
  [#2118]: https://github.com/DataDog/dd-trace-rb/issues/2118
@@ -2939,13 +2970,20 @@ Git diff: https://github.com/DataDog/dd-trace-rb/compare/v0.3.0...v0.3.1
2939
2970
  [#2138]: https://github.com/DataDog/dd-trace-rb/issues/2138
2940
2971
  [#2140]: https://github.com/DataDog/dd-trace-rb/issues/2140
2941
2972
  [#2150]: https://github.com/DataDog/dd-trace-rb/issues/2150
2973
+ [#2153]: https://github.com/DataDog/dd-trace-rb/issues/2153
2942
2974
  [#2158]: https://github.com/DataDog/dd-trace-rb/issues/2158
2943
2975
  [#2162]: https://github.com/DataDog/dd-trace-rb/issues/2162
2944
2976
  [#2163]: https://github.com/DataDog/dd-trace-rb/issues/2163
2977
+ [#2170]: https://github.com/DataDog/dd-trace-rb/issues/2170
2945
2978
  [#2173]: https://github.com/DataDog/dd-trace-rb/issues/2173
2946
2979
  [#2174]: https://github.com/DataDog/dd-trace-rb/issues/2174
2947
2980
  [#2180]: https://github.com/DataDog/dd-trace-rb/issues/2180
2981
+ [#2200]: https://github.com/DataDog/dd-trace-rb/issues/2200
2948
2982
  [#2201]: https://github.com/DataDog/dd-trace-rb/issues/2201
2983
+ [#2219]: https://github.com/DataDog/dd-trace-rb/issues/2219
2984
+ [#2229]: https://github.com/DataDog/dd-trace-rb/issues/2229
2985
+ [#2250]: https://github.com/DataDog/dd-trace-rb/issues/2250
2986
+ [#2279]: https://github.com/DataDog/dd-trace-rb/issues/2279
2949
2987
  [@AdrianLC]: https://github.com/AdrianLC
2950
2988
  [@Azure7111]: https://github.com/Azure7111
2951
2989
  [@BabyGroot]: https://github.com/BabyGroot
@@ -3087,4 +3125,4 @@ Git diff: https://github.com/DataDog/dd-trace-rb/compare/v0.3.0...v0.3.1
3087
3125
  [@walterking]: https://github.com/walterking
3088
3126
  [@y-yagi]: https://github.com/y-yagi
3089
3127
  [@yukimurasawa]: https://github.com/yukimurasawa
3090
- [@zachmccormick]: https://github.com/zachmccormick
3128
+ [@zachmccormick]: https://github.com/zachmccormick
data/README.md CHANGED
@@ -1,4 +1,4 @@
1
- ***Version 1.0.0 has recently been released. Check out our [upgrade guide](https://github.com/DataDog/dd-trace-rb/blob/master/docs/UpgradeGuide.md#from-0x-to-10) for more details.***
1
+ **We've recently released the 1.x version series. If you're upgrading from a 0.x version, check out our [upgrade guide](https://github.com/DataDog/dd-trace-rb/blob/master/docs/UpgradeGuide.md#from-0x-to-10).**
2
2
 
3
3
  # Datadog Trace Client
4
4
 
@@ -85,7 +85,16 @@ static bool failed_to_load(void *handle, VALUE *failure_details) {
85
85
  static bool incompatible_library(void *handle, VALUE *failure_details) {
86
86
  // The library being loaded may be linked to a different libruby than the current executing Ruby.
87
87
  // We check if this is the case by checking if a well-known symbol resolves to a common address.
88
- if (dlsym(handle, "ruby_xmalloc") != &ruby_xmalloc) {
88
+
89
+ void *xmalloc_from_library = dlsym(handle, "ruby_xmalloc");
90
+
91
+ if (xmalloc_from_library == NULL) {
92
+ // This happens when ruby is built without a `libruby.so` by using `--disable-shared` at compilation time.
93
+ // In this situation, no conflict between libruby version is possible.
94
+ return false;
95
+ }
96
+
97
+ if (xmalloc_from_library != &ruby_xmalloc) {
89
98
  *failure_details = rb_str_new_cstr("library was compiled and linked to a different Ruby version");
90
99
  unload_failed_library(handle);
91
100
  return true;
@@ -153,8 +153,6 @@ static VALUE _native_new(VALUE klass) {
153
153
  }
154
154
 
155
155
  static VALUE _native_initialize(DDTRACE_UNUSED VALUE _self, VALUE collector_instance, VALUE recorder_instance, VALUE max_frames) {
156
- enforce_recorder_instance(recorder_instance);
157
-
158
156
  struct cpu_and_wall_time_collector_state *state;
159
157
  TypedData_Get_Struct(collector_instance, struct cpu_and_wall_time_collector_state, &cpu_and_wall_time_collector_typed_data, state);
160
158
 
@@ -164,7 +162,7 @@ static VALUE _native_initialize(DDTRACE_UNUSED VALUE _self, VALUE collector_inst
164
162
  // Update this when modifying state struct
165
163
  state->sampling_buffer = sampling_buffer_new(max_frames_requested);
166
164
  // hash_map_per_thread_context is already initialized, nothing to do here
167
- state->recorder_instance = recorder_instance;
165
+ state->recorder_instance = enforce_recorder_instance(recorder_instance);
168
166
 
169
167
  return Qtrue;
170
168
  }
@@ -180,6 +178,8 @@ static VALUE _native_sample(DDTRACE_UNUSED VALUE _self, VALUE collector_instance
180
178
  //
181
179
  // Assumption 1: This function is called in a thread that is holding the Global VM Lock. Caller is responsible for enforcing this.
182
180
  // Assumption 2: This function is allowed to raise exceptions. Caller is responsible for handling them, if needed.
181
+ // Assumption 3: This function IS NOT called from a signal handler. This function is not async-signal-safe.
182
+ // Assumption 4: This function IS NOT called in a reentrant way.
183
183
  VALUE cpu_and_wall_time_collector_sample(VALUE self_instance) {
184
184
  struct cpu_and_wall_time_collector_state *state;
185
185
  TypedData_Get_Struct(self_instance, struct cpu_and_wall_time_collector_state, &cpu_and_wall_time_collector_typed_data, state);
@@ -384,6 +384,7 @@ static long thread_id_for(VALUE thread) {
384
384
  return FIXNUM_P(object_id) ? FIX2LONG(object_id) : -1;
385
385
  }
386
386
 
387
- void enforce_cpu_and_wall_time_collector_instance(VALUE object) {
387
+ VALUE enforce_cpu_and_wall_time_collector_instance(VALUE object) {
388
388
  Check_TypedStruct(object, &cpu_and_wall_time_collector_typed_data);
389
+ return object;
389
390
  }
@@ -3,4 +3,4 @@
3
3
  #include <ruby.h>
4
4
 
5
5
  VALUE cpu_and_wall_time_collector_sample(VALUE self_instance);
6
- void enforce_cpu_and_wall_time_collector_instance(VALUE object);
6
+ VALUE enforce_cpu_and_wall_time_collector_instance(VALUE object);
@@ -0,0 +1,391 @@
1
+ #include <ruby.h>
2
+ #include <ruby/thread.h>
3
+ #include <ruby/thread_native.h>
4
+ #include <ruby/debug.h>
5
+ #include <stdbool.h>
6
+ #include <signal.h>
7
+ #include "helpers.h"
8
+ #include "ruby_helpers.h"
9
+ #include "collectors_cpu_and_wall_time.h"
10
+ #include "private_vm_api_access.h"
11
+
12
+ // Used to trigger the periodic execution of Collectors::CpuAndWallTime, which implements all of the sampling logic
13
+ // itself; this class only implements the "doing it periodically" part.
14
+ //
15
+ // This file implements the native bits of the Datadog::Profiling::Collectors::CpuAndWallTimeWorker class
16
+
17
+ // ---
18
+ // Here be dragons: This component is quite fiddly and probably one of the more complex in the profiler as it deals with
19
+ // multiple threads, signal handlers, global state, etc.
20
+ //
21
+ // ## Design notes for this class:
22
+ //
23
+ // ### Constraints
24
+ //
25
+ // Currently, sampling Ruby threads requires calling Ruby VM APIs that are only safe to call while holding on to the
26
+ // global VM lock (and are not async-signal safe -- cannot be called from a signal handler).
27
+ //
28
+ // @ivoanjo: As a note, I don't think we should think of this constraint as set in stone. Since we can reach into the Ruby
29
+ // internals, we may be able to figure out a way of overcoming it. But it's definitely going to be hard so for now
30
+ // we're considering it as a given.
31
+ //
32
+ // ### Flow for triggering samples
33
+ //
34
+ // The flow for triggering samples is as follows:
35
+ //
36
+ // 1. Inside the `run_sampling_trigger_loop` function (running in the `CpuAndWallTimeWorker` background thread),
37
+ // a `SIGPROF` signal gets sent to the current process.
38
+ //
39
+ // 2. The `handle_sampling_signal` signal handler function gets called to handle the `SIGPROF` signal.
40
+ //
41
+ // Which thread the signal handler function gets called on by the operating system is quite important. We need to perform
42
+ // an operation -- calling the `rb_postponed_job_register_one` API -- that can only be called from the thread that
43
+ // is holding on to the global VM lock. So this is the thread we're "hoping" our signal lands on.
44
+ //
45
+ // The signal never lands on the `CpuAndWallTimeWorker` background thread because we explicitly block it off from that
46
+ // thread in `block_sigprof_signal_handler_from_running_in_current_thread`.
47
+ //
48
+ // If the signal lands on a thread that is not holding onto the global VM lock, we can't proceed to the next step,
49
+ // and we need to restart the sampling flow from step 1. (There are still quite a few improvements we can make here,
50
+ // but this is the current state of the implementation).
51
+ //
52
+ // 3. Inside `handle_sampling_signal`, if it's getting executed by the Ruby thread that is holding the global VM lock,
53
+ // we can call `rb_postponed_job_register_one` to ask the Ruby VM to call our `sample_from_postponed_job` function
54
+ // "as soon as it can".
55
+ //
56
+ // 4. The Ruby VM calls our `sample_from_postponed_job` from a thread holding the global VM lock. A sample is recorded by
57
+ // calling `cpu_and_wall_time_collector_sample`.
58
+ //
59
+ // ---
60
+
61
+ // Contains state for a single CpuAndWallTimeWorker instance
62
+ struct cpu_and_wall_time_worker_state {
63
+ // Important: This is not atomic nor is it guaranteed to replace memory barriers and the like. Aka this works for
64
+ // telling the sampling trigger loop to stop, but if we ever need to communicate more, we should move to actual
65
+ // atomic operations. stdatomic.h seems a nice thing to reach out for.
66
+ volatile bool should_run;
67
+
68
+ VALUE cpu_and_wall_time_collector_instance;
69
+ // When something goes wrong during sampling, we record the Ruby exception here, so that it can be "re-raised" on
70
+ // the CpuAndWallTimeWorker thread
71
+ VALUE failure_exception;
72
+ };
73
+
74
+ static VALUE _native_new(VALUE klass);
75
+ static VALUE _native_initialize(DDTRACE_UNUSED VALUE _self, VALUE self_instance, VALUE cpu_and_wall_time_collector_instance);
76
+ static void cpu_and_wall_time_worker_typed_data_mark(void *state_ptr);
77
+ static VALUE _native_sampling_loop(VALUE self, VALUE instance);
78
+ static VALUE _native_stop(DDTRACE_UNUSED VALUE _self, VALUE self_instance);
79
+ static void install_sigprof_signal_handler(void (*signal_handler_function)(int, siginfo_t *, void *));
80
+ static void remove_sigprof_signal_handler(void);
81
+ static void block_sigprof_signal_handler_from_running_in_current_thread(void);
82
+ static void handle_sampling_signal(DDTRACE_UNUSED int _signal, DDTRACE_UNUSED siginfo_t *_info, DDTRACE_UNUSED void *_ucontext);
83
+ static void *run_sampling_trigger_loop(void *state_ptr);
84
+ static void interrupt_sampling_trigger_loop(void *state_ptr);
85
+ static void sample_from_postponed_job(DDTRACE_UNUSED void *_unused);
86
+ static VALUE handle_sampling_failure(VALUE self_instance, VALUE exception);
87
+ static VALUE _native_current_sigprof_signal_handler(DDTRACE_UNUSED VALUE self);
88
+ static VALUE release_gvl_and_run_sampling_trigger_loop(VALUE instance);
89
+ static VALUE _native_is_running(DDTRACE_UNUSED VALUE self, VALUE instance);
90
+ static void testing_signal_handler(DDTRACE_UNUSED int _signal, DDTRACE_UNUSED siginfo_t *_info, DDTRACE_UNUSED void *_ucontext);
91
+ static VALUE _native_install_testing_signal_handler(DDTRACE_UNUSED VALUE self);
92
+ static VALUE _native_remove_testing_signal_handler(DDTRACE_UNUSED VALUE self);
93
+
94
+ // Global state -- be very careful when accessing or modifying it
95
+
96
+ // Note: Global state must only be mutated while holding the global VM lock (we piggy back on it to ensure correctness).
97
+ // The active_sampler_instance needs to be global because we access it from the signal handler.
98
+ static VALUE active_sampler_instance = Qnil;
99
+ // ...We also store active_sampler_owner_thread to be able to tell who the active_sampler_instance belongs to (and also
100
+ // to detect when it is outdated)
101
+ static VALUE active_sampler_owner_thread = Qnil;
102
+
103
+ void collectors_cpu_and_wall_time_worker_init(VALUE profiling_module) {
104
+ rb_global_variable(&active_sampler_instance);
105
+ rb_global_variable(&active_sampler_owner_thread);
106
+
107
+ VALUE collectors_module = rb_define_module_under(profiling_module, "Collectors");
108
+ VALUE collectors_cpu_and_wall_time_worker_class = rb_define_class_under(collectors_module, "CpuAndWallTimeWorker", rb_cObject);
109
+ // Hosts methods used for testing the native code using RSpec
110
+ VALUE testing_module = rb_define_module_under(collectors_cpu_and_wall_time_worker_class, "Testing");
111
+
112
+ // Instances of the CpuAndWallTimeWorker class are "TypedData" objects.
113
+ // "TypedData" objects are special objects in the Ruby VM that can wrap C structs.
114
+ // In this case, it wraps the cpu_and_wall_time_worker_state.
115
+ //
116
+ // Because Ruby doesn't know how to initialize native-level structs, we MUST override the allocation function for objects
117
+ // of this class so that we can manage this part. Not overriding or disabling the allocation function is a common
118
+ // gotcha for "TypedData" objects that can very easily lead to VM crashes, see for instance
119
+ // https://bugs.ruby-lang.org/issues/18007 for a discussion around this.
120
+ rb_define_alloc_func(collectors_cpu_and_wall_time_worker_class, _native_new);
121
+
122
+ rb_define_singleton_method(collectors_cpu_and_wall_time_worker_class, "_native_initialize", _native_initialize, 2);
123
+ rb_define_singleton_method(collectors_cpu_and_wall_time_worker_class, "_native_sampling_loop", _native_sampling_loop, 1);
124
+ rb_define_singleton_method(collectors_cpu_and_wall_time_worker_class, "_native_stop", _native_stop, 1);
125
+ rb_define_singleton_method(testing_module, "_native_current_sigprof_signal_handler", _native_current_sigprof_signal_handler, 0);
126
+ rb_define_singleton_method(testing_module, "_native_is_running?", _native_is_running, 1);
127
+ rb_define_singleton_method(testing_module, "_native_install_testing_signal_handler", _native_install_testing_signal_handler, 0);
128
+ rb_define_singleton_method(testing_module, "_native_remove_testing_signal_handler", _native_remove_testing_signal_handler, 0);
129
+ }
130
+
131
+ // This structure is used to define a Ruby object that stores a pointer to a struct cpu_and_wall_time_worker_state
132
+ // See also https://github.com/ruby/ruby/blob/master/doc/extension.rdoc for how this works
133
+ static const rb_data_type_t cpu_and_wall_time_worker_typed_data = {
134
+ .wrap_struct_name = "Datadog::Profiling::Collectors::CpuAndWallTimeWorker",
135
+ .function = {
136
+ .dmark = cpu_and_wall_time_worker_typed_data_mark,
137
+ .dfree = RUBY_DEFAULT_FREE,
138
+ .dsize = NULL, // We don't track profile memory usage (although it'd be cool if we did!)
139
+ //.dcompact = NULL, // FIXME: Add support for compaction
140
+ },
141
+ .flags = RUBY_TYPED_FREE_IMMEDIATELY
142
+ };
143
+
144
+ static VALUE _native_new(VALUE klass) {
145
+ struct cpu_and_wall_time_worker_state *state = ruby_xcalloc(1, sizeof(struct cpu_and_wall_time_worker_state));
146
+
147
+ state->should_run = false;
148
+ state->cpu_and_wall_time_collector_instance = Qnil;
149
+ state->failure_exception = Qnil;
150
+
151
+ return TypedData_Wrap_Struct(klass, &cpu_and_wall_time_worker_typed_data, state);
152
+ }
153
+
154
+ static VALUE _native_initialize(DDTRACE_UNUSED VALUE _self, VALUE self_instance, VALUE cpu_and_wall_time_collector_instance) {
155
+ struct cpu_and_wall_time_worker_state *state;
156
+ TypedData_Get_Struct(self_instance, struct cpu_and_wall_time_worker_state, &cpu_and_wall_time_worker_typed_data, state);
157
+
158
+ state->cpu_and_wall_time_collector_instance = enforce_cpu_and_wall_time_collector_instance(cpu_and_wall_time_collector_instance);
159
+
160
+ return Qtrue;
161
+ }
162
+
163
+ // Since our state contains references to Ruby objects, we need to tell the Ruby GC about them
164
+ static void cpu_and_wall_time_worker_typed_data_mark(void *state_ptr) {
165
+ struct cpu_and_wall_time_worker_state *state = (struct cpu_and_wall_time_worker_state *) state_ptr;
166
+
167
+ rb_gc_mark(state->cpu_and_wall_time_collector_instance);
168
+ rb_gc_mark(state->failure_exception);
169
+ }
170
+
171
+ // Called in a background thread created in CpuAndWallTimeWorker#start
172
+ static VALUE _native_sampling_loop(DDTRACE_UNUSED VALUE _self, VALUE instance) {
173
+ struct cpu_and_wall_time_worker_state *state;
174
+ TypedData_Get_Struct(instance, struct cpu_and_wall_time_worker_state, &cpu_and_wall_time_worker_typed_data, state);
175
+
176
+ if (active_sampler_owner_thread != Qnil && is_thread_alive(active_sampler_owner_thread)) {
177
+ rb_raise(
178
+ rb_eRuntimeError,
179
+ "Could not start CpuAndWallTimeWorker: There's already another instance of CpuAndWallTimeWorker active in a different thread"
180
+ );
181
+ }
182
+
183
+ // This write to a global is thread-safe BECAUSE we're still holding on to the global VM lock at this point
184
+ active_sampler_instance = instance;
185
+ active_sampler_owner_thread = rb_thread_current();
186
+
187
+ state->should_run = true;
188
+
189
+ block_sigprof_signal_handler_from_running_in_current_thread(); // We want to interrupt the thread with the global VM lock, never this one
190
+
191
+ install_sigprof_signal_handler(handle_sampling_signal);
192
+
193
+ // Release GVL, get to the actual work!
194
+ int exception_state;
195
+ rb_protect(release_gvl_and_run_sampling_trigger_loop, instance, &exception_state);
196
+
197
+ // The sample trigger loop finished (either cleanly or with an error); let's clean up
198
+
199
+ remove_sigprof_signal_handler();
200
+ active_sampler_instance = Qnil;
201
+ active_sampler_owner_thread = Qnil;
202
+
203
+ // Ensure that instance is not garbage collected while the native sampling loop is running; this is probably not needed, but just in case
204
+ RB_GC_GUARD(instance);
205
+
206
+ if (exception_state) rb_jump_tag(exception_state); // Re-raise any exception that happened
207
+
208
+ return Qnil;
209
+ }
210
+
211
+ static VALUE _native_stop(DDTRACE_UNUSED VALUE _self, VALUE self_instance) {
212
+ struct cpu_and_wall_time_worker_state *state;
213
+ TypedData_Get_Struct(self_instance, struct cpu_and_wall_time_worker_state, &cpu_and_wall_time_worker_typed_data, state);
214
+
215
+ state->should_run = false;
216
+
217
+ return Qtrue;
218
+ }
219
+
220
+ static void install_sigprof_signal_handler(void (*signal_handler_function)(int, siginfo_t *, void *)) {
221
+ struct sigaction existing_signal_handler_config = {.sa_sigaction = NULL};
222
+ struct sigaction signal_handler_config = {
223
+ .sa_flags = SA_RESTART | SA_SIGINFO,
224
+ .sa_sigaction = signal_handler_function
225
+ };
226
+ sigemptyset(&signal_handler_config.sa_mask);
227
+
228
+ if (sigaction(SIGPROF, &signal_handler_config, &existing_signal_handler_config) != 0) {
229
+ rb_sys_fail("Could not start CpuAndWallTimeWorker: Could not install signal handler");
230
+ }
231
+
232
+ // In some corner cases (e.g. after a fork), our signal handler may still be around, and that's ok
233
+ if (existing_signal_handler_config.sa_sigaction == handle_sampling_signal) return;
234
+
235
+ if (existing_signal_handler_config.sa_handler != NULL || existing_signal_handler_config.sa_sigaction != NULL) {
236
+ // A previous signal handler already existed. Currently we don't support this situation, so let's just back out
237
+ // of the installation.
238
+
239
+ if (sigaction(SIGPROF, &existing_signal_handler_config, NULL) != 0) {
240
+ rb_sys_fail(
241
+ "Could not start CpuAndWallTimeWorker: Could not re-install pre-existing SIGPROF signal handler. " \
242
+ "This may break the component had installed it."
243
+ );
244
+ }
245
+
246
+ rb_raise(rb_eRuntimeError, "Could not start CpuAndWallTimeWorker: There's a pre-existing SIGPROF signal handler");
247
+ }
248
+ }
249
+
250
+ static void remove_sigprof_signal_handler(void) {
251
+ struct sigaction signal_handler_config = {
252
+ .sa_handler = SIG_DFL, // Reset back to default
253
+ .sa_flags = SA_RESTART // TODO: Unclear if this is actually needed/does anything at all
254
+ };
255
+ sigemptyset(&signal_handler_config.sa_mask);
256
+
257
+ if (sigaction(SIGPROF, &signal_handler_config, NULL) != 0) rb_sys_fail("Failure while removing the signal handler");
258
+ }
259
+
260
+ static void block_sigprof_signal_handler_from_running_in_current_thread(void) {
261
+ sigset_t signals_to_block;
262
+ sigemptyset(&signals_to_block);
263
+ sigaddset(&signals_to_block, SIGPROF);
264
+ pthread_sigmask(SIG_BLOCK, &signals_to_block, NULL);
265
+ }
266
+
267
+ static void handle_sampling_signal(DDTRACE_UNUSED int _signal, DDTRACE_UNUSED siginfo_t *_info, DDTRACE_UNUSED void *_ucontext) {
268
+ if (!ruby_thread_has_gvl_p()) {
269
+ return; // Not safe to enqueue a sample from this thread
270
+ }
271
+
272
+ // We implicitly assume there can be no concurrent nor nested calls to handle_sampling_signal because
273
+ // a) we get triggered using SIGPROF, and the docs state a second SIGPROF will not interrupt an existing one
274
+ // b) we validate we are in the thread that has the global VM lock; if a different thread gets a signal, it will return early
275
+ // because it will not have the global VM lock
276
+ // TODO: Validate that this does not impact Ractors
277
+
278
+ // Note: rb_postponed_job_register_one ensures that if there's a previous sample_from_postponed_job queued for execution
279
+ // then we will not queue a second one. It does this by doing a linear scan on the existing jobs; in the future we
280
+ // may want to implement that check ourselves.
281
+
282
+ // TODO: Do something with result (potentially update tracking counters?)
283
+ /*int result =*/ rb_postponed_job_register_one(0, sample_from_postponed_job, NULL);
284
+ }
285
+
286
+ // The actual sampling trigger loop always runs **without** the global vm lock.
287
+ static void *run_sampling_trigger_loop(void *state_ptr) {
288
+ struct cpu_and_wall_time_worker_state *state = (struct cpu_and_wall_time_worker_state *) state_ptr;
289
+
290
+ struct timespec time_between_signals = {.tv_nsec = 10 * 1000 * 1000 /* 10ms */};
291
+
292
+ while (state->should_run) {
293
+ // TODO: This is still a placeholder for a more complex mechanism. In particular:
294
+ // * We want to signal a particular thread or threads, not the process in general
295
+ // * We want to track if a signal landed on the thread holding the global VM lock and do something about it
296
+ // * We want to do more than having a fixed sampling rate
297
+
298
+ kill(getpid(), SIGPROF);
299
+ nanosleep(&time_between_signals, NULL);
300
+ }
301
+
302
+ return NULL; // Unused
303
+ }
304
+
305
+ // This is called by the Ruby VM when it wants to shut down the background thread
306
+ static void interrupt_sampling_trigger_loop(void *state_ptr) {
307
+ struct cpu_and_wall_time_worker_state *state = (struct cpu_and_wall_time_worker_state *) state_ptr;
308
+
309
+ state->should_run = false;
310
+ }
311
+
312
+ static void sample_from_postponed_job(DDTRACE_UNUSED void *_unused) {
313
+ VALUE instance = active_sampler_instance; // Read from global variable
314
+
315
+ // This can potentially happen if the CpuAndWallTimeWorker was stopped while the postponed job was waiting to be executed; nothing to do
316
+ if (instance == Qnil) return;
317
+
318
+ struct cpu_and_wall_time_worker_state *state;
319
+ TypedData_Get_Struct(instance, struct cpu_and_wall_time_worker_state, &cpu_and_wall_time_worker_typed_data, state);
320
+
321
+ // Trigger sampling using the Collectors::CpuAndWallTime; rescue against any exceptions that happen during sampling
322
+ VALUE (*function_to_call_safely)(VALUE) = cpu_and_wall_time_collector_sample;
323
+ VALUE function_to_call_safely_arg = state->cpu_and_wall_time_collector_instance;
324
+ VALUE (*exception_handler_function)(VALUE, VALUE) = handle_sampling_failure;
325
+ VALUE exception_handler_function_arg = instance;
326
+ rb_rescue2(
327
+ function_to_call_safely,
328
+ function_to_call_safely_arg,
329
+ exception_handler_function,
330
+ exception_handler_function_arg,
331
+ rb_eException, // rb_eException is the base class of all Ruby exceptions
332
+ 0 // Required by API to be the last argument
333
+ );
334
+ }
335
+
336
+ static VALUE handle_sampling_failure(VALUE self_instance, VALUE exception) {
337
+ struct cpu_and_wall_time_worker_state *state;
338
+ TypedData_Get_Struct(self_instance, struct cpu_and_wall_time_worker_state, &cpu_and_wall_time_worker_typed_data, state);
339
+
340
+ state->should_run = false;
341
+ state->failure_exception = exception;
342
+
343
+ return Qnil;
344
+ }
345
+
346
+ static VALUE _native_current_sigprof_signal_handler(DDTRACE_UNUSED VALUE self) {
347
+ struct sigaction existing_signal_handler_config = {.sa_sigaction = NULL};
348
+ if (sigaction(SIGPROF, NULL, &existing_signal_handler_config) != 0) {
349
+ rb_sys_fail("Failed to probe existing handler");
350
+ }
351
+
352
+ if (existing_signal_handler_config.sa_sigaction == handle_sampling_signal) {
353
+ return ID2SYM(rb_intern("profiling"));
354
+ } else if (existing_signal_handler_config.sa_sigaction != NULL) {
355
+ return ID2SYM(rb_intern("other"));
356
+ } else {
357
+ return Qnil;
358
+ }
359
+ }
360
+
361
+ static VALUE release_gvl_and_run_sampling_trigger_loop(VALUE instance) {
362
+ struct cpu_and_wall_time_worker_state *state;
363
+ TypedData_Get_Struct(instance, struct cpu_and_wall_time_worker_state, &cpu_and_wall_time_worker_typed_data, state);
364
+
365
+ rb_thread_call_without_gvl(run_sampling_trigger_loop, state, interrupt_sampling_trigger_loop, state);
366
+
367
+ // If we stopped sampling due to an exception, re-raise it (now in the worker thread)
368
+ if (state->failure_exception != Qnil) rb_exc_raise(state->failure_exception);
369
+
370
+ return Qnil;
371
+ }
372
+
373
+ static VALUE _native_is_running(DDTRACE_UNUSED VALUE self, VALUE instance) {
374
+ return \
375
+ (active_sampler_owner_thread != Qnil && is_thread_alive(active_sampler_owner_thread) && active_sampler_instance == instance) ?
376
+ Qtrue : Qfalse;
377
+ }
378
+
379
+ static void testing_signal_handler(DDTRACE_UNUSED int _signal, DDTRACE_UNUSED siginfo_t *_info, DDTRACE_UNUSED void *_ucontext) {
380
+ /* Does nothing on purpose */
381
+ }
382
+
383
+ static VALUE _native_install_testing_signal_handler(DDTRACE_UNUSED VALUE self) {
384
+ install_sigprof_signal_handler(testing_signal_handler);
385
+ return Qtrue;
386
+ }
387
+
388
+ static VALUE _native_remove_testing_signal_handler(DDTRACE_UNUSED VALUE self) {
389
+ remove_sigprof_signal_handler();
390
+ return Qtrue;
391
+ }
@@ -144,6 +144,8 @@ if RUBY_VERSION < '2.3'
144
144
  $defs << '-DUSE_LEGACY_RB_PROFILE_FRAMES'
145
145
  # ... you couldn't name threads
146
146
  $defs << '-DNO_THREAD_NAMES'
147
+ # ...the ruby_thread_has_gvl_p function was not exposed to users outside of the VM
148
+ $defs << '-DNO_THREAD_HAS_GVL'
147
149
  end
148
150
 
149
151
  # If we got here, libdatadog is available and loaded
@@ -681,3 +681,12 @@ int ddtrace_rb_profile_frames(VALUE thread, int start, int limit, VALUE *buff, i
681
681
  }
682
682
 
683
683
  #endif // USE_LEGACY_RB_PROFILE_FRAMES
684
+
685
+ #ifdef NO_THREAD_HAS_GVL
686
+ int ruby_thread_has_gvl_p(void) {
687
+ // TODO: The CpuAndWallTimeWorker needs this function, but Ruby 2.2 doesn't expose it... For now this placeholder
688
+ // will enable the profiling native extension to continue to compile on Ruby 2.2, but the CpuAndWallTimeWorker will
689
+ // not work properly on 2.2. Will be addressed later.
690
+ return 0;
691
+ }
692
+ #endif // NO_THREAD_HAS_GVL
@@ -5,6 +5,7 @@
5
5
 
6
6
  // Each class/module here is implemented in their separate file
7
7
  void collectors_cpu_and_wall_time_init(VALUE profiling_module);
8
+ void collectors_cpu_and_wall_time_worker_init(VALUE profiling_module);
8
9
  void collectors_stack_init(VALUE profiling_module);
9
10
  void http_transport_init(VALUE profiling_module);
10
11
  void stack_recorder_init(VALUE profiling_module);
@@ -22,6 +23,7 @@ void DDTRACE_EXPORT Init_ddtrace_profiling_native_extension(void) {
22
23
  rb_define_singleton_method(native_extension_module, "clock_id_for", clock_id_for, 1); // from clock_id.h
23
24
 
24
25
  collectors_cpu_and_wall_time_init(profiling_module);
26
+ collectors_cpu_and_wall_time_worker_init(profiling_module);
25
27
  collectors_stack_init(profiling_module);
26
28
  http_transport_init(profiling_module);
27
29
  stack_recorder_init(profiling_module);
@@ -324,8 +324,9 @@ static void *call_serialize_without_gvl(void *call_args) {
324
324
  return NULL; // Unused
325
325
  }
326
326
 
327
- void enforce_recorder_instance(VALUE object) {
327
+ VALUE enforce_recorder_instance(VALUE object) {
328
328
  Check_TypedStruct(object, &stack_recorder_typed_data);
329
+ return object;
329
330
  }
330
331
 
331
332
  static struct active_slot_pair sampler_lock_active_profile(struct stack_recorder_state *state) {
@@ -15,12 +15,13 @@
15
15
  // ```
16
16
  #define VALUE_STRING(string) {.ptr = "" string, .len = sizeof(string) - 1}
17
17
 
18
- #define CPU_TIME_VALUE {.type_ = VALUE_STRING("cpu-time"), .unit = VALUE_STRING("nanoseconds")}
19
- #define CPU_SAMPLES_VALUE {.type_ = VALUE_STRING("cpu-samples"), .unit = VALUE_STRING("count")}
20
- #define WALL_TIME_VALUE {.type_ = VALUE_STRING("wall-time"), .unit = VALUE_STRING("nanoseconds")}
21
- #define ALLOC_SAMPLES_VALUE {.type_ = VALUE_STRING("alloc-samples"), .unit = VALUE_STRING("count")}
22
- #define ALLOC_SPACE_VALUE {.type_ = VALUE_STRING("alloc-space"), .unit = VALUE_STRING("bytes")}
23
- #define HEAP_SPACE_VALUE {.type_ = VALUE_STRING("heap-space"), .unit = VALUE_STRING("bytes")}
18
+ #define CPU_TIME_VALUE {.type_ = VALUE_STRING("cpu-time"), .unit = VALUE_STRING("nanoseconds")}
19
+ #define CPU_SAMPLES_VALUE {.type_ = VALUE_STRING("cpu-samples"), .unit = VALUE_STRING("count")}
20
+ #define WALL_TIME_VALUE {.type_ = VALUE_STRING("wall-time"), .unit = VALUE_STRING("nanoseconds")}
21
+ #define ALLOC_SIZE_VALUE {.type_ = VALUE_STRING("alloc-size"), .unit = VALUE_STRING("bytes")}
22
+ #define ALLOC_SAMPLES_VALUE {.type_ = VALUE_STRING("alloc-samples"), .unit = VALUE_STRING("count")}
23
+ #define HEAP_LIVE_SIZE_VALUE {.type_ = VALUE_STRING("heap-live-size"), .unit = VALUE_STRING("bytes")}
24
+ #define HEAP_LIVE_SAMPLES_VALUE {.type_ = VALUE_STRING("heap-live-samples"), .unit = VALUE_STRING("count")}
24
25
 
25
26
  static const ddprof_ffi_ValueType enabled_value_types[] = {
26
27
  #define CPU_TIME_VALUE_POS 0
@@ -34,4 +35,4 @@ static const ddprof_ffi_ValueType enabled_value_types[] = {
34
35
  #define ENABLED_VALUE_TYPES_COUNT (sizeof(enabled_value_types) / sizeof(ddprof_ffi_ValueType))
35
36
 
36
37
  void record_sample(VALUE recorder_instance, ddprof_ffi_Sample sample);
37
- void enforce_recorder_instance(VALUE object);
38
+ VALUE enforce_recorder_instance(VALUE object);
@@ -35,7 +35,7 @@ module Datadog
35
35
  false
36
36
  end
37
37
 
38
- def default_configuration
38
+ def new_configuration
39
39
  Configuration::Settings.new
40
40
  end
41
41
 
@@ -36,7 +36,7 @@ module Datadog
36
36
  false
37
37
  end
38
38
 
39
- def default_configuration
39
+ def new_configuration
40
40
  Configuration::Settings.new
41
41
  end
42
42