ddtrace 1.6.1 → 1.8.0

Files changed (171)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +89 -2
  3. data/README.md +2 -2
  4. data/ext/ddtrace_profiling_loader/extconf.rb +5 -2
  5. data/ext/ddtrace_profiling_native_extension/NativeExtensionDesign.md +1 -1
  6. data/ext/ddtrace_profiling_native_extension/clock_id_from_pthread.c +3 -2
  7. data/ext/ddtrace_profiling_native_extension/collectors_cpu_and_wall_time.c +81 -47
  8. data/ext/ddtrace_profiling_native_extension/collectors_cpu_and_wall_time.h +1 -1
  9. data/ext/ddtrace_profiling_native_extension/collectors_cpu_and_wall_time_worker.c +332 -125
  10. data/ext/ddtrace_profiling_native_extension/collectors_dynamic_sampling_rate.c +142 -0
  11. data/ext/ddtrace_profiling_native_extension/collectors_dynamic_sampling_rate.h +14 -0
  12. data/ext/ddtrace_profiling_native_extension/collectors_idle_sampling_helper.c +241 -0
  13. data/ext/ddtrace_profiling_native_extension/collectors_idle_sampling_helper.h +3 -0
  14. data/ext/ddtrace_profiling_native_extension/collectors_stack.c +11 -13
  15. data/ext/ddtrace_profiling_native_extension/extconf.rb +22 -8
  16. data/ext/ddtrace_profiling_native_extension/helpers.h +5 -0
  17. data/ext/ddtrace_profiling_native_extension/native_extension_helpers.rb +8 -0
  18. data/ext/ddtrace_profiling_native_extension/private_vm_api_access.c +111 -26
  19. data/ext/ddtrace_profiling_native_extension/private_vm_api_access.h +9 -0
  20. data/ext/ddtrace_profiling_native_extension/profiling.c +205 -0
  21. data/ext/ddtrace_profiling_native_extension/ruby_helpers.c +86 -0
  22. data/ext/ddtrace_profiling_native_extension/ruby_helpers.h +28 -6
  23. data/ext/ddtrace_profiling_native_extension/setup_signal_handler.c +115 -0
  24. data/ext/ddtrace_profiling_native_extension/setup_signal_handler.h +11 -0
  25. data/ext/ddtrace_profiling_native_extension/stack_recorder.c +84 -35
  26. data/ext/ddtrace_profiling_native_extension/stack_recorder.h +1 -0
  27. data/ext/ddtrace_profiling_native_extension/time_helpers.c +17 -0
  28. data/ext/ddtrace_profiling_native_extension/time_helpers.h +10 -0
  29. data/lib/datadog/appsec/assets/blocked.html +98 -3
  30. data/lib/datadog/appsec/assets/blocked.json +1 -0
  31. data/lib/datadog/appsec/assets/blocked.text +5 -0
  32. data/lib/datadog/appsec/assets/waf_rules/recommended.json +35 -46
  33. data/lib/datadog/appsec/assets/waf_rules/risky.json +1 -1
  34. data/lib/datadog/appsec/assets/waf_rules/strict.json +46 -1
  35. data/lib/datadog/appsec/assets.rb +2 -2
  36. data/lib/datadog/appsec/configuration/settings.rb +6 -0
  37. data/lib/datadog/appsec/configuration.rb +4 -0
  38. data/lib/datadog/appsec/contrib/rack/reactive/request.rb +4 -8
  39. data/lib/datadog/appsec/contrib/rack/request.rb +17 -0
  40. data/lib/datadog/appsec/contrib/rack/request_body_middleware.rb +2 -2
  41. data/lib/datadog/appsec/contrib/rack/request_middleware.rb +2 -2
  42. data/lib/datadog/appsec/contrib/rails/patcher.rb +3 -6
  43. data/lib/datadog/appsec/contrib/sinatra/ext.rb +1 -0
  44. data/lib/datadog/appsec/contrib/sinatra/gateway/watcher.rb +1 -1
  45. data/lib/datadog/appsec/contrib/sinatra/patcher.rb +11 -8
  46. data/lib/datadog/appsec/extensions.rb +10 -0
  47. data/lib/datadog/appsec/processor.rb +18 -0
  48. data/lib/datadog/appsec/response.rb +54 -0
  49. data/lib/datadog/core/configuration/components.rb +27 -6
  50. data/lib/datadog/core/configuration/ext.rb +18 -0
  51. data/lib/datadog/core/configuration/settings.rb +14 -341
  52. data/lib/datadog/core/diagnostics/health.rb +4 -22
  53. data/lib/datadog/core/environment/variable_helpers.rb +58 -10
  54. data/lib/datadog/core/runtime/ext.rb +1 -1
  55. data/lib/datadog/core/utils.rb +0 -21
  56. data/lib/datadog/core.rb +21 -1
  57. data/lib/datadog/opentracer/distributed_headers.rb +7 -9
  58. data/lib/datadog/opentracer/rack_propagator.rb +0 -3
  59. data/lib/datadog/opentracer/text_map_propagator.rb +5 -7
  60. data/lib/datadog/profiling/collectors/cpu_and_wall_time.rb +10 -4
  61. data/lib/datadog/profiling/collectors/cpu_and_wall_time_worker.rb +20 -5
  62. data/lib/datadog/profiling/collectors/dynamic_sampling_rate.rb +14 -0
  63. data/lib/datadog/profiling/collectors/idle_sampling_helper.rb +68 -0
  64. data/lib/datadog/profiling/collectors/old_stack.rb +7 -0
  65. data/lib/datadog/profiling/exporter.rb +5 -0
  66. data/lib/datadog/profiling/old_recorder.rb +8 -0
  67. data/lib/datadog/profiling/profiler.rb +7 -0
  68. data/lib/datadog/profiling/scheduler.rb +4 -7
  69. data/lib/datadog/profiling/stack_recorder.rb +36 -0
  70. data/lib/datadog/profiling/tasks/setup.rb +0 -7
  71. data/lib/datadog/profiling.rb +2 -0
  72. data/lib/datadog/tracing/configuration/ext.rb +33 -3
  73. data/lib/datadog/tracing/configuration/settings.rb +433 -0
  74. data/lib/datadog/tracing/contrib/aws/configuration/settings.rb +4 -1
  75. data/lib/datadog/tracing/contrib/aws/ext.rb +1 -0
  76. data/lib/datadog/tracing/contrib/dalli/configuration/settings.rb +4 -1
  77. data/lib/datadog/tracing/contrib/dalli/ext.rb +1 -0
  78. data/lib/datadog/tracing/contrib/delayed_job/plugin.rb +4 -0
  79. data/lib/datadog/tracing/contrib/elasticsearch/configuration/settings.rb +5 -1
  80. data/lib/datadog/tracing/contrib/elasticsearch/ext.rb +1 -0
  81. data/lib/datadog/tracing/contrib/ethon/configuration/settings.rb +6 -1
  82. data/lib/datadog/tracing/contrib/ethon/ext.rb +1 -0
  83. data/lib/datadog/tracing/contrib/excon/configuration/settings.rb +5 -1
  84. data/lib/datadog/tracing/contrib/excon/ext.rb +1 -0
  85. data/lib/datadog/tracing/contrib/faraday/configuration/settings.rb +5 -1
  86. data/lib/datadog/tracing/contrib/faraday/ext.rb +1 -0
  87. data/lib/datadog/tracing/contrib/grpc/configuration/settings.rb +6 -1
  88. data/lib/datadog/tracing/contrib/grpc/datadog_interceptor/client.rb +2 -1
  89. data/lib/datadog/tracing/contrib/grpc/datadog_interceptor/server.rb +6 -12
  90. data/lib/datadog/tracing/contrib/grpc/distributed/fetcher.rb +27 -0
  91. data/lib/datadog/tracing/contrib/grpc/distributed/propagation.rb +43 -0
  92. data/lib/datadog/tracing/contrib/grpc/ext.rb +1 -0
  93. data/lib/datadog/tracing/contrib/grpc/patcher.rb +0 -2
  94. data/lib/datadog/tracing/contrib/http/configuration/settings.rb +6 -1
  95. data/lib/datadog/tracing/contrib/http/distributed/fetcher.rb +32 -0
  96. data/lib/datadog/tracing/contrib/http/distributed/propagation.rb +38 -0
  97. data/lib/datadog/tracing/contrib/http/ext.rb +1 -0
  98. data/lib/datadog/tracing/contrib/httpclient/configuration/settings.rb +6 -1
  99. data/lib/datadog/tracing/contrib/httpclient/ext.rb +1 -0
  100. data/lib/datadog/tracing/contrib/httprb/configuration/settings.rb +6 -1
  101. data/lib/datadog/tracing/contrib/httprb/ext.rb +1 -0
  102. data/lib/datadog/tracing/contrib/kafka/consumer_event.rb +1 -0
  103. data/lib/datadog/tracing/contrib/kafka/events/produce_operation/send_messages.rb +1 -0
  104. data/lib/datadog/tracing/contrib/kafka/events/producer/deliver_messages.rb +1 -0
  105. data/lib/datadog/tracing/contrib/mongodb/configuration/settings.rb +5 -1
  106. data/lib/datadog/tracing/contrib/mongodb/ext.rb +1 -0
  107. data/lib/datadog/tracing/contrib/mongodb/subscribers.rb +2 -0
  108. data/lib/datadog/tracing/contrib/mysql2/configuration/settings.rb +4 -1
  109. data/lib/datadog/tracing/contrib/mysql2/ext.rb +1 -0
  110. data/lib/datadog/tracing/contrib/mysql2/instrumentation.rb +2 -2
  111. data/lib/datadog/tracing/contrib/patcher.rb +3 -2
  112. data/lib/datadog/tracing/contrib/pg/configuration/settings.rb +4 -1
  113. data/lib/datadog/tracing/contrib/pg/ext.rb +1 -0
  114. data/lib/datadog/tracing/contrib/pg/instrumentation.rb +12 -2
  115. data/lib/datadog/tracing/contrib/presto/configuration/settings.rb +4 -1
  116. data/lib/datadog/tracing/contrib/presto/ext.rb +1 -0
  117. data/lib/datadog/tracing/contrib/propagation/sql_comment/ext.rb +1 -0
  118. data/lib/datadog/tracing/contrib/propagation/sql_comment.rb +10 -12
  119. data/lib/datadog/tracing/contrib/que/tracer.rb +2 -0
  120. data/lib/datadog/tracing/contrib/racecar/events/batch.rb +4 -1
  121. data/lib/datadog/tracing/contrib/racecar/events/message.rb +4 -1
  122. data/lib/datadog/tracing/contrib/rack/middlewares.rb +2 -0
  123. data/lib/datadog/tracing/contrib/redis/configuration/settings.rb +4 -1
  124. data/lib/datadog/tracing/contrib/redis/ext.rb +1 -0
  125. data/lib/datadog/tracing/contrib/redis/instrumentation.rb +30 -21
  126. data/lib/datadog/tracing/contrib/redis/integration.rb +34 -2
  127. data/lib/datadog/tracing/contrib/redis/patcher.rb +18 -14
  128. data/lib/datadog/tracing/contrib/redis/quantize.rb +12 -9
  129. data/lib/datadog/tracing/contrib/redis/tags.rb +4 -6
  130. data/lib/datadog/tracing/contrib/redis/trace_middleware.rb +72 -0
  131. data/lib/datadog/tracing/contrib/resque/resque_job.rb +2 -0
  132. data/lib/datadog/tracing/contrib/rest_client/configuration/settings.rb +6 -1
  133. data/lib/datadog/tracing/contrib/rest_client/ext.rb +1 -0
  134. data/lib/datadog/tracing/contrib/shoryuken/tracer.rb +2 -0
  135. data/lib/datadog/tracing/contrib/sidekiq/client_tracer.rb +5 -0
  136. data/lib/datadog/tracing/contrib/sidekiq/server_tracer.rb +5 -0
  137. data/lib/datadog/tracing/contrib/sneakers/tracer.rb +2 -0
  138. data/lib/datadog/{core → tracing}/diagnostics/ext.rb +1 -6
  139. data/lib/datadog/tracing/diagnostics/health.rb +40 -0
  140. data/lib/datadog/tracing/distributed/b3_multi.rb +66 -0
  141. data/lib/datadog/tracing/distributed/b3_single.rb +66 -0
  142. data/lib/datadog/tracing/distributed/datadog.rb +153 -0
  143. data/lib/datadog/tracing/distributed/datadog_tags_codec.rb +1 -0
  144. data/lib/datadog/tracing/distributed/fetcher.rb +30 -0
  145. data/lib/datadog/tracing/distributed/headers/ext.rb +18 -16
  146. data/lib/datadog/tracing/distributed/helpers.rb +9 -7
  147. data/lib/datadog/tracing/distributed/none.rb +19 -0
  148. data/lib/datadog/tracing/distributed/propagation.rb +127 -0
  149. data/lib/datadog/tracing/distributed/trace_context.rb +369 -0
  150. data/lib/datadog/tracing/metadata/ext.rb +1 -1
  151. data/lib/datadog/tracing/propagation/http.rb +3 -106
  152. data/lib/datadog/tracing/sampling/priority_sampler.rb +11 -0
  153. data/lib/datadog/tracing/sampling/rate_sampler.rb +3 -3
  154. data/lib/datadog/tracing/span.rb +3 -19
  155. data/lib/datadog/tracing/span_operation.rb +5 -4
  156. data/lib/datadog/tracing/trace_digest.rb +75 -2
  157. data/lib/datadog/tracing/trace_operation.rb +5 -4
  158. data/lib/datadog/tracing/trace_segment.rb +1 -1
  159. data/lib/datadog/tracing/utils.rb +50 -0
  160. data/lib/ddtrace/transport/trace_formatter.rb +2 -5
  161. data/lib/ddtrace/version.rb +2 -2
  162. metadata +35 -15
  163. data/lib/datadog/tracing/distributed/headers/b3.rb +0 -55
  164. data/lib/datadog/tracing/distributed/headers/b3_single.rb +0 -67
  165. data/lib/datadog/tracing/distributed/headers/datadog.rb +0 -144
  166. data/lib/datadog/tracing/distributed/headers/parser.rb +0 -37
  167. data/lib/datadog/tracing/distributed/metadata/b3.rb +0 -55
  168. data/lib/datadog/tracing/distributed/metadata/b3_single.rb +0 -66
  169. data/lib/datadog/tracing/distributed/metadata/datadog.rb +0 -73
  170. data/lib/datadog/tracing/distributed/metadata/parser.rb +0 -34
  171. data/lib/datadog/tracing/propagation/grpc.rb +0 -98
data/ext/ddtrace_profiling_native_extension/collectors_cpu_and_wall_time_worker.c:
@@ -3,11 +3,18 @@
  #include <ruby/thread_native.h>
  #include <ruby/debug.h>
  #include <stdbool.h>
+ #include <stdatomic.h>
  #include <signal.h>
+ #include <errno.h>
+
  #include "helpers.h"
  #include "ruby_helpers.h"
  #include "collectors_cpu_and_wall_time.h"
+ #include "collectors_dynamic_sampling_rate.h"
+ #include "collectors_idle_sampling_helper.h"
  #include "private_vm_api_access.h"
+ #include "setup_signal_handler.h"
+ #include "time_helpers.h"

  // Used to trigger the periodic execution of Collectors::CpuAndWallTime, which implements all of the sampling logic
  // itself; this class only implements the "doing it periodically" part.
@@ -29,7 +36,7 @@
  // internals, we may be able to figure out a way of overcoming it. But it's definitely going to be hard so for now
  // we're considering it as a given.
  //
- // ### Flow for triggering samples
+ // ### Flow for triggering CPU/Wall-time samples
  //
  // The flow for triggering samples is as follows:
  //
@@ -56,23 +63,56 @@
  // 4. The Ruby VM calls our `sample_from_postponed_job` from a thread holding the global VM lock. A sample is recorded by
  //    calling `cpu_and_wall_time_collector_sample`.
  //
+ // ### TracePoints and Forking
+ //
+ // When the Ruby VM forks, the CPU/Wall-time profiling stops naturally because it's triggered by a background thread
+ // that doesn't get automatically restarted by the VM on the child process. (The profiler does trigger its restart at
+ // some point -- see `Profiling::Tasks::Setup` for details).
+ //
+ // But this doesn't apply to any `TracePoint`s this class may use, which will continue to be active. Thus, we need to
+ // always remember to consider this case -- the worker thread may not be alive but the `TracePoint`s can continue to
+ // trigger samples.
+ //
  // ---

  // Contains state for a single CpuAndWallTimeWorker instance
  struct cpu_and_wall_time_worker_state {
-   // Important: This is not atomic nor is it guaranteed to replace memory barriers and the like. Aka this works for
-   // telling the sampling trigger loop to stop, but if we ever need to communicate more, we should move to actual
-   // atomic operations. stdatomic.h seems a nice thing to reach out for.
-   volatile bool should_run;
+   atomic_bool should_run;
+
    bool gc_profiling_enabled;
+   VALUE self_instance;
    VALUE cpu_and_wall_time_collector_instance;
+   VALUE idle_sampling_helper_instance;
+   VALUE owner_thread;
+   dynamic_sampling_rate_state dynamic_sampling_rate;

    // When something goes wrong during sampling, we record the Ruby exception here, so that it can be "re-raised" on
    // the CpuAndWallTimeWorker thread
    VALUE failure_exception;
+   // Used by `_native_stop` to flag the worker thread to start (see comment on `_native_sampling_loop`)
+   VALUE stop_thread;

    // Used to get gc start/finish information
    VALUE gc_tracepoint;
+
+   struct stats {
+     // How many times we tried to trigger a sample
+     unsigned int trigger_sample_attempts;
+     // How many times we tried to simulate signal delivery
+     unsigned int trigger_simulated_signal_delivery_attempts;
+     // How many times we actually simulated signal delivery
+     unsigned int simulated_signal_delivery;
+     // How many times we actually called rb_postponed_job_register_one from a signal handler
+     unsigned int signal_handler_enqueued_sample;
+     // How many times the signal handler was called from the wrong thread
+     unsigned int signal_handler_wrong_thread;
+     // How many times we actually sampled (except GC samples)
+     unsigned int sampled;
+     // Min/max/total wall-time spent sampling (except GC samples)
+     uint64_t sampling_time_ns_min;
+     uint64_t sampling_time_ns_max;
+     uint64_t sampling_time_ns_total;
+   } stats;
  };

  static VALUE _native_new(VALUE klass);
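The `volatile bool` → `atomic_bool` change in this struct is the reason for the new `stdatomic.h` include above. As a minimal standalone sketch (not gem code; plain C11 + pthreads, with hypothetical names), this is the stop-flag pattern it enables -- unlike `volatile`, `atomic_bool` guarantees the write is visible to the looping thread:

```c
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <unistd.h>

static atomic_bool should_run; // atomic_bool provides cross-thread visibility guarantees

static void *sampling_trigger_loop(void *unused) {
  (void) unused;
  while (atomic_load(&should_run)) {
    // ... trigger a sample here ...
    usleep(10 * 1000); // 10ms, mirroring the worker's minimum time between signals
  }
  return NULL;
}

int main(void) {
  atomic_init(&should_run, true);

  pthread_t worker;
  pthread_create(&worker, NULL, sampling_trigger_loop, NULL);

  sleep(1);
  atomic_store(&should_run, false); // Any thread can safely ask the loop to stop
  pthread_join(worker, NULL);

  printf("worker stopped cleanly\n");
  return 0;
}
```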
@@ -80,19 +120,18 @@ static VALUE _native_initialize(
    DDTRACE_UNUSED VALUE _self,
    VALUE self_instance,
    VALUE cpu_and_wall_time_collector_instance,
-   VALUE gc_profiling_enabled
+   VALUE gc_profiling_enabled,
+   VALUE idle_sampling_helper_instance
  );
  static void cpu_and_wall_time_worker_typed_data_mark(void *state_ptr);
  static VALUE _native_sampling_loop(VALUE self, VALUE instance);
- static VALUE _native_stop(DDTRACE_UNUSED VALUE _self, VALUE self_instance);
+ static VALUE _native_stop(DDTRACE_UNUSED VALUE _self, VALUE self_instance, VALUE worker_thread);
  static VALUE stop(VALUE self_instance, VALUE optional_exception);
- static void install_sigprof_signal_handler(void (*signal_handler_function)(int, siginfo_t *, void *));
- static void remove_sigprof_signal_handler(void);
- static void block_sigprof_signal_handler_from_running_in_current_thread(void);
  static void handle_sampling_signal(DDTRACE_UNUSED int _signal, DDTRACE_UNUSED siginfo_t *_info, DDTRACE_UNUSED void *_ucontext);
  static void *run_sampling_trigger_loop(void *state_ptr);
  static void interrupt_sampling_trigger_loop(void *state_ptr);
  static void sample_from_postponed_job(DDTRACE_UNUSED void *_unused);
+ static VALUE rescued_sample_from_postponed_job(VALUE self_instance);
  static VALUE handle_sampling_failure(VALUE self_instance, VALUE exception);
  static VALUE _native_current_sigprof_signal_handler(DDTRACE_UNUSED VALUE self);
  static VALUE release_gvl_and_run_sampling_trigger_loop(VALUE instance);
@@ -104,22 +143,30 @@ static VALUE _native_trigger_sample(DDTRACE_UNUSED VALUE self);
  static VALUE _native_gc_tracepoint(DDTRACE_UNUSED VALUE self, VALUE instance);
  static void on_gc_event(VALUE tracepoint_data, DDTRACE_UNUSED void *unused);
  static void after_gc_from_postponed_job(DDTRACE_UNUSED void *_unused);
- static void safely_call(VALUE (*function_to_call_safely)(VALUE), VALUE function_to_call_safely_arg, VALUE instance);
+ static VALUE safely_call(VALUE (*function_to_call_safely)(VALUE), VALUE function_to_call_safely_arg, VALUE instance);
  static VALUE _native_simulate_handle_sampling_signal(DDTRACE_UNUSED VALUE self);
  static VALUE _native_simulate_sample_from_postponed_job(DDTRACE_UNUSED VALUE self);
-
- // Global state -- be very careful when accessing or modifying it
-
- // Note: Global state must only be mutated while holding the global VM lock (we piggy back on it to ensure correctness).
- // The active_sampler_instance needs to be global because we access it from the signal handler.
+ static VALUE _native_reset_after_fork(DDTRACE_UNUSED VALUE self, VALUE instance);
+ static VALUE _native_is_sigprof_blocked_in_current_thread(DDTRACE_UNUSED VALUE self);
+ static VALUE _native_stats(DDTRACE_UNUSED VALUE self, VALUE instance);
+ void *simulate_sampling_signal_delivery(DDTRACE_UNUSED void *_unused);
+ static void grab_gvl_and_sample(void);
+ static void reset_stats(struct cpu_and_wall_time_worker_state *state);
+ static void sleep_for(uint64_t time_ns);
+
+ // Note on sampler global state safety:
+ //
+ // Both `active_sampler_instance` and `active_sampler_instance_state` are **GLOBAL** state. Be careful when accessing
+ // or modifying them.
+ // In particular, it's important to only mutate them while holding the global VM lock, to ensure correctness.
+ //
+ // This global state is needed because a bunch of functions in this file need to access it from situations
+ // (e.g. signal handler) where it's impossible or just awkward to pass it as an argument.
  static VALUE active_sampler_instance = Qnil;
- // ...We also store active_sampler_owner_thread to be able to tell who the active_sampler_instance belongs to (and also
- // to detect when it is outdated)
- static VALUE active_sampler_owner_thread = Qnil;
+ struct cpu_and_wall_time_worker_state *active_sampler_instance_state = NULL;

  void collectors_cpu_and_wall_time_worker_init(VALUE profiling_module) {
    rb_global_variable(&active_sampler_instance);
-   rb_global_variable(&active_sampler_owner_thread);

    VALUE collectors_module = rb_define_module_under(profiling_module, "Collectors");
    VALUE collectors_cpu_and_wall_time_worker_class = rb_define_class_under(collectors_module, "CpuAndWallTimeWorker", rb_cObject);
@@ -136,9 +183,11 @@ void collectors_cpu_and_wall_time_worker_init(VALUE profiling_module) {
    // https://bugs.ruby-lang.org/issues/18007 for a discussion around this.
    rb_define_alloc_func(collectors_cpu_and_wall_time_worker_class, _native_new);

-   rb_define_singleton_method(collectors_cpu_and_wall_time_worker_class, "_native_initialize", _native_initialize, 3);
+   rb_define_singleton_method(collectors_cpu_and_wall_time_worker_class, "_native_initialize", _native_initialize, 4);
    rb_define_singleton_method(collectors_cpu_and_wall_time_worker_class, "_native_sampling_loop", _native_sampling_loop, 1);
-   rb_define_singleton_method(collectors_cpu_and_wall_time_worker_class, "_native_stop", _native_stop, 1);
+   rb_define_singleton_method(collectors_cpu_and_wall_time_worker_class, "_native_stop", _native_stop, 2);
+   rb_define_singleton_method(collectors_cpu_and_wall_time_worker_class, "_native_reset_after_fork", _native_reset_after_fork, 1);
+   rb_define_singleton_method(collectors_cpu_and_wall_time_worker_class, "_native_stats", _native_stats, 1);
    rb_define_singleton_method(testing_module, "_native_current_sigprof_signal_handler", _native_current_sigprof_signal_handler, 0);
    rb_define_singleton_method(testing_module, "_native_is_running?", _native_is_running, 1);
    rb_define_singleton_method(testing_module, "_native_install_testing_signal_handler", _native_install_testing_signal_handler, 0);
@@ -147,6 +196,7 @@ void collectors_cpu_and_wall_time_worker_init(VALUE profiling_module) {
    rb_define_singleton_method(testing_module, "_native_gc_tracepoint", _native_gc_tracepoint, 1);
    rb_define_singleton_method(testing_module, "_native_simulate_handle_sampling_signal", _native_simulate_handle_sampling_signal, 0);
    rb_define_singleton_method(testing_module, "_native_simulate_sample_from_postponed_job", _native_simulate_sample_from_postponed_job, 0);
+   rb_define_singleton_method(testing_module, "_native_is_sigprof_blocked_in_current_thread", _native_is_sigprof_blocked_in_current_thread, 0);
  }

  // This structure is used to define a Ruby object that stores a pointer to a struct cpu_and_wall_time_worker_state
@@ -156,7 +206,7 @@ static const rb_data_type_t cpu_and_wall_time_worker_typed_data = {
    .function = {
      .dmark = cpu_and_wall_time_worker_typed_data_mark,
      .dfree = RUBY_DEFAULT_FREE,
-     .dsize = NULL, // We don't track profile memory usage (although it'd be cool if we did!)
+     .dsize = NULL, // We don't track memory usage (although it'd be cool if we did!)
      //.dcompact = NULL, // FIXME: Add support for compaction
    },
    .flags = RUBY_TYPED_FREE_IMMEDIATELY
@@ -165,20 +215,26 @@ static const rb_data_type_t cpu_and_wall_time_worker_typed_data = {
  static VALUE _native_new(VALUE klass) {
    struct cpu_and_wall_time_worker_state *state = ruby_xcalloc(1, sizeof(struct cpu_and_wall_time_worker_state));

-   state->should_run = false;
+   atomic_init(&state->should_run, false);
    state->gc_profiling_enabled = false;
    state->cpu_and_wall_time_collector_instance = Qnil;
+   state->idle_sampling_helper_instance = Qnil;
+   state->owner_thread = Qnil;
+   dynamic_sampling_rate_init(&state->dynamic_sampling_rate);
    state->failure_exception = Qnil;
+   state->stop_thread = Qnil;
    state->gc_tracepoint = Qnil;
+   reset_stats(state);

-   return TypedData_Wrap_Struct(klass, &cpu_and_wall_time_worker_typed_data, state);
+   return state->self_instance = TypedData_Wrap_Struct(klass, &cpu_and_wall_time_worker_typed_data, state);
  }

  static VALUE _native_initialize(
    DDTRACE_UNUSED VALUE _self,
    VALUE self_instance,
    VALUE cpu_and_wall_time_collector_instance,
-   VALUE gc_profiling_enabled
+   VALUE gc_profiling_enabled,
+   VALUE idle_sampling_helper_instance
  ) {
    ENFORCE_BOOLEAN(gc_profiling_enabled);

@@ -187,6 +243,7 @@ static VALUE _native_initialize(

    state->gc_profiling_enabled = (gc_profiling_enabled == Qtrue);
    state->cpu_and_wall_time_collector_instance = enforce_cpu_and_wall_time_collector_instance(cpu_and_wall_time_collector_instance);
+   state->idle_sampling_helper_instance = idle_sampling_helper_instance;
    state->gc_tracepoint = rb_tracepoint_new(Qnil, RUBY_INTERNAL_EVENT_GC_ENTER | RUBY_INTERNAL_EVENT_GC_EXIT, on_gc_event, NULL /* unused */);

    return Qtrue;
@@ -197,7 +254,10 @@ static void cpu_and_wall_time_worker_typed_data_mark(void *state_ptr) {
    struct cpu_and_wall_time_worker_state *state = (struct cpu_and_wall_time_worker_state *) state_ptr;

    rb_gc_mark(state->cpu_and_wall_time_collector_instance);
+   rb_gc_mark(state->idle_sampling_helper_instance);
+   rb_gc_mark(state->owner_thread);
    rb_gc_mark(state->failure_exception);
+   rb_gc_mark(state->stop_thread);
    rb_gc_mark(state->gc_tracepoint);
  }

@@ -206,8 +266,9 @@ static VALUE _native_sampling_loop(DDTRACE_UNUSED VALUE _self, VALUE instance) {
    struct cpu_and_wall_time_worker_state *state;
    TypedData_Get_Struct(instance, struct cpu_and_wall_time_worker_state, &cpu_and_wall_time_worker_typed_data, state);

-   if (active_sampler_owner_thread != Qnil) {
-     if (is_thread_alive(active_sampler_owner_thread)) {
+   struct cpu_and_wall_time_worker_state *old_state = active_sampler_instance_state;
+   if (old_state != NULL) {
+     if (is_thread_alive(old_state->owner_thread)) {
        rb_raise(
          rb_eRuntimeError,
          "Could not start CpuAndWallTimeWorker: There's already another instance of CpuAndWallTimeWorker active in a different thread"
@@ -221,23 +282,26 @@ static VALUE _native_sampling_loop(DDTRACE_UNUSED VALUE _self, VALUE instance) {
      // b) If this is the same instance of the CpuAndWallTimeWorker if we call enable on a tracepoint that is already
      // enabled, it will start firing more than once, see https://bugs.ruby-lang.org/issues/19114 for details.

-     struct cpu_and_wall_time_worker_state *old_state;
-     TypedData_Get_Struct(active_sampler_instance, struct cpu_and_wall_time_worker_state, &cpu_and_wall_time_worker_typed_data, old_state);
      rb_tracepoint_disable(old_state->gc_tracepoint);
    }
  }

+   // We use `stop_thread` to distinguish when `_native_stop` was called before we actually had a chance to start. In this
+   // situation we stop immediately and never even start the sampling trigger loop.
+   if (state->stop_thread == rb_thread_current()) return Qnil;
+
+   // Reset the dynamic sampling rate state, if any (reminder: the monotonic clock reference may change after a fork)
+   dynamic_sampling_rate_reset(&state->dynamic_sampling_rate);
+
    // This write to a global is thread-safe BECAUSE we're still holding on to the global VM lock at this point
+   active_sampler_instance_state = state;
    active_sampler_instance = instance;
-   active_sampler_owner_thread = rb_thread_current();
+   state->owner_thread = rb_thread_current();

-   state->should_run = true;
+   atomic_store(&state->should_run, true);

    block_sigprof_signal_handler_from_running_in_current_thread(); // We want to interrupt the thread with the global VM lock, never this one

-   install_sigprof_signal_handler(handle_sampling_signal);
-   if (state->gc_profiling_enabled) rb_tracepoint_enable(state->gc_tracepoint);
-
    // Release GVL, get to the actual work!
    int exception_state;
    rb_protect(release_gvl_and_run_sampling_trigger_loop, instance, &exception_state);
@@ -245,9 +309,32 @@ static VALUE _native_sampling_loop(DDTRACE_UNUSED VALUE _self, VALUE instance) {
    // The sample trigger loop finished (either cleanly or with an error); let's clean up

    rb_tracepoint_disable(state->gc_tracepoint);
-   remove_sigprof_signal_handler();
+
+   active_sampler_instance_state = NULL;
    active_sampler_instance = Qnil;
-   active_sampler_owner_thread = Qnil;
+   state->owner_thread = Qnil;
+
+   // If this `Thread` is about to die, why is this important? It's because Ruby caches native threads for a period after
+   // the `Thread` dies, and reuses them if a new Ruby `Thread` gets created. This means that while conceptually the
+   // worker background `Thread` is about to die, the low-level native OS thread can be reused for something else in the Ruby app.
+   // Then, the reused thread would "inherit" the SIGPROF blocking, which is... really unexpected.
+   // This actually caused a flaky test -- the `native_extension_spec.rb` creates a `Thread` and tries to specifically
+   // send SIGPROF signals to it, and oops it could fail if it got the reused native thread from the worker which still
+   // had SIGPROF delivery blocked. :hide_the_pain_harold:
+   unblock_sigprof_signal_handler_from_running_in_current_thread();
+
+   // Why replace and not remove the signal handler? We do this because when a process receives a SIGPROF without
+   // having an explicit signal handler set up, the process will instantly terminate with a confusing
+   // "Profiling timer expired" message left behind. (This message doesn't come from us -- it's the default message for
+   // an unhandled SIGPROF. Pretty confusing UNIX/POSIX behavior...)
+   //
+   // Unfortunately, because signal delivery is asynchronous, there's no way to guarantee that there are no pending
+   // profiler-sent signals by the time we get here and want to clean up.
+   // @ivoanjo: I suspect this will never happen, but the cost of getting it wrong is really high (VM terminates) so this
+   // is a just-in-case situation.
+   //
+   // Note 2: This can raise exceptions as well, so make sure that all cleanups are done by the time we get here.
+   replace_sigprof_signal_handler_with_empty_handler(handle_sampling_signal);

    // Ensure that instance is not garbage collected while the native sampling loop is running; this is probably not needed, but just in case
    RB_GC_GUARD(instance);
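The cleanup above replaces the SIGPROF handler instead of resetting it to the default. A standalone sketch of why (not gem code -- `setup_signal_handler.c` itself is not shown in this diff): with a no-op handler installed, a stray profiler-sent SIGPROF becomes harmless, whereas under `SIG_DFL` it would terminate the process with "Profiling timer expired":

```c
#include <signal.h>
#include <stdio.h>

static void empty_signal_handler(int sig, siginfo_t *info, void *ucontext) {
  (void) sig; (void) info; (void) ucontext; // Deliberately does nothing
}

int main(void) {
  struct sigaction config = {.sa_flags = SA_RESTART | SA_SIGINFO, .sa_sigaction = empty_signal_handler};
  sigemptyset(&config.sa_mask);

  if (sigaction(SIGPROF, &config, NULL) != 0) {
    perror("sigaction");
    return 1;
  }

  raise(SIGPROF); // Under SIG_DFL this would kill the process; now it's a no-op

  printf("survived a stray SIGPROF\n");
  return 0;
}
```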
@@ -257,7 +344,12 @@ static VALUE _native_sampling_loop(DDTRACE_UNUSED VALUE _self, VALUE instance) {
    return Qnil;
  }

- static VALUE _native_stop(DDTRACE_UNUSED VALUE _self, VALUE self_instance) {
+ static VALUE _native_stop(DDTRACE_UNUSED VALUE _self, VALUE self_instance, VALUE worker_thread) {
+   struct cpu_and_wall_time_worker_state *state;
+   TypedData_Get_Struct(self_instance, struct cpu_and_wall_time_worker_state, &cpu_and_wall_time_worker_typed_data, state);
+
+   state->stop_thread = worker_thread;
+
    return stop(self_instance, /* optional_exception: */ Qnil);
  }

@@ -265,7 +357,7 @@ static VALUE stop(VALUE self_instance, VALUE optional_exception) {
    struct cpu_and_wall_time_worker_state *state;
    TypedData_Get_Struct(self_instance, struct cpu_and_wall_time_worker_state, &cpu_and_wall_time_worker_typed_data, state);

-   state->should_run = false;
+   atomic_store(&state->should_run, false);
    state->failure_exception = optional_exception;

    // Disable the GC tracepoint as soon as possible, so the VM doesn't keep on calling it
@@ -274,92 +366,80 @@ static VALUE stop(VALUE self_instance, VALUE optional_exception) {
    return Qtrue;
  }

- static void install_sigprof_signal_handler(void (*signal_handler_function)(int, siginfo_t *, void *)) {
-   struct sigaction existing_signal_handler_config = {.sa_sigaction = NULL};
-   struct sigaction signal_handler_config = {
-     .sa_flags = SA_RESTART | SA_SIGINFO,
-     .sa_sigaction = signal_handler_function
-   };
-   sigemptyset(&signal_handler_config.sa_mask);
-
-   if (sigaction(SIGPROF, &signal_handler_config, &existing_signal_handler_config) != 0) {
-     rb_sys_fail("Could not start CpuAndWallTimeWorker: Could not install signal handler");
-   }
-
-   // In some corner cases (e.g. after a fork), our signal handler may still be around, and that's ok
-   if (existing_signal_handler_config.sa_sigaction == handle_sampling_signal) return;
-
-   if (existing_signal_handler_config.sa_handler != NULL || existing_signal_handler_config.sa_sigaction != NULL) {
-     // A previous signal handler already existed. Currently we don't support this situation, so let's just back out
-     // of the installation.
-
-     if (sigaction(SIGPROF, &existing_signal_handler_config, NULL) != 0) {
-       rb_sys_fail(
-         "Could not start CpuAndWallTimeWorker: Could not re-install pre-existing SIGPROF signal handler. " \
-         "This may break the component had installed it."
-       );
-     }
-
-     rb_raise(rb_eRuntimeError, "Could not start CpuAndWallTimeWorker: There's a pre-existing SIGPROF signal handler");
-   }
- }
-
- static void remove_sigprof_signal_handler(void) {
-   struct sigaction signal_handler_config = {
-     .sa_handler = SIG_DFL, // Reset back to default
-     .sa_flags = SA_RESTART // TODO: Unclear if this is actually needed/does anything at all
-   };
-   sigemptyset(&signal_handler_config.sa_mask);
-
-   if (sigaction(SIGPROF, &signal_handler_config, NULL) != 0) rb_sys_fail("Failure while removing the signal handler");
- }
-
- static void block_sigprof_signal_handler_from_running_in_current_thread(void) {
-   sigset_t signals_to_block;
-   sigemptyset(&signals_to_block);
-   sigaddset(&signals_to_block, SIGPROF);
-   pthread_sigmask(SIG_BLOCK, &signals_to_block, NULL);
- }
-
  // NOTE: Remember that this will run in the thread and within the scope of user code, including user C code.
  // We need to be careful not to change any state that may be observed OR to restore it if we do. For instance, if anything
  // we do here can set `errno`, then we must be careful to restore the old `errno` after the fact.
  static void handle_sampling_signal(DDTRACE_UNUSED int _signal, DDTRACE_UNUSED siginfo_t *_info, DDTRACE_UNUSED void *_ucontext) {
-   if (!ruby_thread_has_gvl_p()) {
-     return; // Not safe to enqueue a sample from this thread
-   }
-   if (!ddtrace_rb_ractor_main_p()) {
-     return; // We're not on the main Ractor; we currently don't support profiling non-main Ractors
+   struct cpu_and_wall_time_worker_state *state = active_sampler_instance_state; // Read from global variable, see "sampler global state safety" note above
+
+   // This can potentially happen if the CpuAndWallTimeWorker was stopped while the signal delivery was happening; nothing to do
+   if (state == NULL) return;
+
+   if (
+     !ruby_native_thread_p() || // Not a Ruby thread
+     !is_current_thread_holding_the_gvl() || // Not safe to enqueue a sample from this thread
+     !ddtrace_rb_ractor_main_p() // We're not on the main Ractor; we currently don't support profiling non-main Ractors
+   ) {
+     state->stats.signal_handler_wrong_thread++;
+     return;
    }

    // We implicitly assume there can be no concurrent nor nested calls to handle_sampling_signal because
    // a) we get triggered using SIGPROF, and the docs state a second SIGPROF will not interrupt an existing one
    // b) we validate we are in the thread that has the global VM lock; if a different thread gets a signal, it will return early
    // because it will not have the global VM lock
-   // TODO: Validate that this does not impact Ractors

    // Note: rb_postponed_job_register_one ensures that if there's a previous sample_from_postponed_job queued for execution
    // then we will not queue a second one. It does this by doing a linear scan on the existing jobs; in the future we
    // may want to implement that check ourselves.

-   // TODO: Do something with result (potentially update tracking counters?)
+   state->stats.signal_handler_enqueued_sample++;
+
+   // Note: If we ever want to get rid of rb_postponed_job_register_one, remember not to clobber Ruby exceptions, as
+   // this function does this helpful job for us now -- https://github.com/ruby/ruby/commit/a98e343d39c4d7bf1e2190b076720f32d9f298b3.
    /*int result =*/ rb_postponed_job_register_one(0, sample_from_postponed_job, NULL);
+   // TODO: Do something with result (potentially update tracking counters?)
  }

  // The actual sampling trigger loop always runs **without** the global vm lock.
  static void *run_sampling_trigger_loop(void *state_ptr) {
    struct cpu_and_wall_time_worker_state *state = (struct cpu_and_wall_time_worker_state *) state_ptr;

-   struct timespec time_between_signals = {.tv_nsec = 10 * 1000 * 1000 /* 10ms */};
+   uint64_t minimum_time_between_signals = MILLIS_AS_NS(10);
+
+   while (atomic_load(&state->should_run)) {
+     state->stats.trigger_sample_attempts++;

-   while (state->should_run) {
      // TODO: This is still a placeholder for a more complex mechanism. In particular:
-     // * We want to signal a particular thread or threads, not the process in general
-     // * We want to track if a signal landed on the thread holding the global VM lock and do something about it
      // * We want to do more than having a fixed sampling rate

-     kill(getpid(), SIGPROF);
-     nanosleep(&time_between_signals, NULL);
+     current_gvl_owner owner = gvl_owner();
+     if (owner.valid) {
+       // Note that reading the GVL owner and sending them a signal is a race -- the Ruby VM keeps on executing while
+       // we're doing this, so we may still not signal the correct thread from time to time, but our signal handler
+       // includes a check to see if it got called in the right thread
+       pthread_kill(owner.owner, SIGPROF);
+     } else {
+       // If no thread owns the Global VM Lock, the application is probably idle at the moment. We still want to sample
+       // so we "ask a friend" (the IdleSamplingHelper component) to grab the GVL and simulate getting a SIGPROF.
+       //
+       // In a previous version of the code, we called `grab_gvl_and_sample` directly BUT this was problematic because
+       // Ruby may concurrently get busy and so the CpuAndWallTimeWorker would be blocked in line to acquire the GVL
+       // for an uncontrolled amount of time. (This can still happen to the IdleSamplingHelper, but the
+       // CpuAndWallTimeWorker will still be free to interrupt the Ruby VM and keep sampling for the entire blocking period).
+       state->stats.trigger_simulated_signal_delivery_attempts++;
+       idle_sampling_helper_request_action(state->idle_sampling_helper_instance, grab_gvl_and_sample);
+     }
+
+     sleep_for(minimum_time_between_signals);
+
+     // The dynamic sampling rate module keeps track of how long samples are taking, and here we extend our sleep time
+     // to take that into account.
+     // Note that we deliberately should NOT combine this sleep_for with the one above because the result of
+     // `dynamic_sampling_rate_get_sleep` may have changed while the above sleep was ongoing.
+     uint64_t extra_sleep =
+       dynamic_sampling_rate_get_sleep(&state->dynamic_sampling_rate, monotonic_wall_time_now_ns(DO_NOT_RAISE_ON_FAILURE));
+     if (extra_sleep > 0) sleep_for(extra_sleep);
    }

    return NULL; // Unused
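The signal handler above deliberately does almost no work itself: it only enqueues `sample_from_postponed_job` via `rb_postponed_job_register_one`, and the actual sampling runs later from safe context. A minimal sketch of that defer-from-signal-handler pattern outside Ruby (hypothetical names, plain C11, not gem code):

```c
#include <signal.h>
#include <stdatomic.h>
#include <stdio.h>

static atomic_bool sample_pending; // Stand-in for the postponed job queue

static void handler(int sig, siginfo_t *info, void *ucontext) {
  (void) sig; (void) info; (void) ucontext;
  atomic_store(&sample_pending, true); // Async-signal-safe: only flag the work
}

int main(void) {
  struct sigaction config = {.sa_flags = SA_RESTART | SA_SIGINFO, .sa_sigaction = handler};
  sigemptyset(&config.sa_mask);
  sigaction(SIGPROF, &config, NULL);

  raise(SIGPROF);

  // Later, from regular (non-handler) context, drain the flag and do the real work
  if (atomic_exchange(&sample_pending, false)) {
    printf("taking a sample from safe context\n");
  }
  return 0;
}
```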
@@ -369,14 +449,14 @@ static void *run_sampling_trigger_loop(void *state_ptr) {
  static void interrupt_sampling_trigger_loop(void *state_ptr) {
    struct cpu_and_wall_time_worker_state *state = (struct cpu_and_wall_time_worker_state *) state_ptr;

-   state->should_run = false;
+   atomic_store(&state->should_run, false);
  }

  static void sample_from_postponed_job(DDTRACE_UNUSED void *_unused) {
-   VALUE instance = active_sampler_instance; // Read from global variable
+   struct cpu_and_wall_time_worker_state *state = active_sampler_instance_state; // Read from global variable, see "sampler global state safety" note above

    // This can potentially happen if the CpuAndWallTimeWorker was stopped while the postponed job was waiting to be executed; nothing to do
-   if (instance == Qnil) return;
+   if (state == NULL) return;

    // @ivoanjo: I'm not sure this can ever happen because `handle_sampling_signal` only enqueues this callback if
    // it's running on the main Ractor, but just in case...
@@ -384,14 +464,45 @@ static void sample_from_postponed_job(DDTRACE_UNUSED void *_unused) {
      return; // We're not on the main Ractor; we currently don't support profiling non-main Ractors
    }

+   // Rescue against any exceptions that happen during sampling
+   safely_call(rescued_sample_from_postponed_job, state->self_instance, state->self_instance);
+ }
+
+ static VALUE rescued_sample_from_postponed_job(VALUE self_instance) {
    struct cpu_and_wall_time_worker_state *state;
-   TypedData_Get_Struct(instance, struct cpu_and_wall_time_worker_state, &cpu_and_wall_time_worker_typed_data, state);
+   TypedData_Get_Struct(self_instance, struct cpu_and_wall_time_worker_state, &cpu_and_wall_time_worker_typed_data, state);

-   // Trigger sampling using the Collectors::CpuAndWallTime; rescue against any exceptions that happen during sampling
-   safely_call(cpu_and_wall_time_collector_sample, state->cpu_and_wall_time_collector_instance, instance);
+   long wall_time_ns_before_sample = monotonic_wall_time_now_ns(RAISE_ON_FAILURE);
+
+   if (!dynamic_sampling_rate_should_sample(&state->dynamic_sampling_rate, wall_time_ns_before_sample)) {
+     // TODO: Add a counter for this
+     return Qnil;
+   }
+
+   state->stats.sampled++;
+
+   cpu_and_wall_time_collector_sample(state->cpu_and_wall_time_collector_instance, wall_time_ns_before_sample);
+
+   long wall_time_ns_after_sample = monotonic_wall_time_now_ns(RAISE_ON_FAILURE);
+   long delta_ns = wall_time_ns_after_sample - wall_time_ns_before_sample;
+
+   // Guard against wall-time going backwards, see https://github.com/DataDog/dd-trace-rb/pull/2336 for discussion.
+   uint64_t sampling_time_ns = delta_ns < 0 ? 0 : delta_ns;
+
+   state->stats.sampling_time_ns_min = uint64_min_of(sampling_time_ns, state->stats.sampling_time_ns_min);
+   state->stats.sampling_time_ns_max = uint64_max_of(sampling_time_ns, state->stats.sampling_time_ns_max);
+   state->stats.sampling_time_ns_total += sampling_time_ns;
+
+   dynamic_sampling_rate_after_sample(&state->dynamic_sampling_rate, wall_time_ns_after_sample, sampling_time_ns);
+
+   // Return a dummy VALUE because we're called from rb_rescue2 which requires it
+   return Qnil;
  }

- static VALUE handle_sampling_failure(VALUE self_instance, VALUE exception) { return stop(self_instance, exception); }
+ static VALUE handle_sampling_failure(VALUE self_instance, VALUE exception) {
+   stop(self_instance, exception);
+   return Qnil;
+ }

  // This method exists only to enable testing Datadog::Profiling::Collectors::CpuAndWallTimeWorker behavior using RSpec.
  // It SHOULD NOT be used for other purposes.
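For reference, the min/max/total bookkeeping above (including the negative-delta clamp and the `UINT64_MAX` sentinel that `reset_stats` applies further down) boils down to this small standalone helper (an illustrative restatement, not gem code):

```c
#include <stdint.h>

typedef struct { uint64_t min, max, total; unsigned int count; } timing_stats;

// Mirrors reset_stats: the UINT64_MAX sentinel means the first recorded sample always becomes the min
static void timing_stats_reset(timing_stats *stats) {
  *stats = (timing_stats) {.min = UINT64_MAX};
}

static void timing_stats_record(timing_stats *stats, long delta_ns) {
  // Clamp to zero in case a monotonic clock hiccup makes the delta negative
  uint64_t duration_ns = delta_ns < 0 ? 0 : (uint64_t) delta_ns;

  if (duration_ns < stats->min) stats->min = duration_ns;
  if (duration_ns > stats->max) stats->max = duration_ns;
  stats->total += duration_ns;
  stats->count++;
}
```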
@@ -403,6 +514,8 @@ static VALUE _native_current_sigprof_signal_handler(DDTRACE_UNUSED VALUE self) {

    if (existing_signal_handler_config.sa_sigaction == handle_sampling_signal) {
      return ID2SYM(rb_intern("profiling"));
+   } else if (existing_signal_handler_config.sa_sigaction == empty_signal_handler) {
+     return ID2SYM(rb_intern("empty"));
    } else if (existing_signal_handler_config.sa_sigaction != NULL) {
      return ID2SYM(rb_intern("other"));
    } else {
@@ -414,6 +527,11 @@ static VALUE release_gvl_and_run_sampling_trigger_loop(VALUE instance) {
    struct cpu_and_wall_time_worker_state *state;
    TypedData_Get_Struct(instance, struct cpu_and_wall_time_worker_state, &cpu_and_wall_time_worker_typed_data, state);

+   // Final preparations: Setup signal handler and enable tracepoint. We run these here and not in `_native_sampling_loop`
+   // because they may raise exceptions.
+   install_sigprof_signal_handler(handle_sampling_signal, "handle_sampling_signal");
+   if (state->gc_profiling_enabled) rb_tracepoint_enable(state->gc_tracepoint);
+
    rb_thread_call_without_gvl(run_sampling_trigger_loop, state, interrupt_sampling_trigger_loop, state);

    // If we stopped sampling due to an exception, re-raise it (now in the worker thread)
@@ -425,9 +543,9 @@ static VALUE release_gvl_and_run_sampling_trigger_loop(VALUE instance) {
  // This method exists only to enable testing Datadog::Profiling::Collectors::CpuAndWallTimeWorker behavior using RSpec.
  // It SHOULD NOT be used for other purposes.
  static VALUE _native_is_running(DDTRACE_UNUSED VALUE self, VALUE instance) {
-   return \
-     (active_sampler_owner_thread != Qnil && is_thread_alive(active_sampler_owner_thread) && active_sampler_instance == instance) ?
-       Qtrue : Qfalse;
+   struct cpu_and_wall_time_worker_state *state = active_sampler_instance_state; // Read from global variable, see "sampler global state safety" note above
+
+   return (state != NULL && is_thread_alive(state->owner_thread) && state->self_instance == instance) ? Qtrue : Qfalse;
  }

  static void testing_signal_handler(DDTRACE_UNUSED int _signal, DDTRACE_UNUSED siginfo_t *_info, DDTRACE_UNUSED void *_ucontext) {
@@ -437,7 +555,7 @@ static void testing_signal_handler(DDTRACE_UNUSED int _signal, DDTRACE_UNUSED si
  // This method exists only to enable testing Datadog::Profiling::Collectors::CpuAndWallTimeWorker behavior using RSpec.
  // It SHOULD NOT be used for other purposes.
  static VALUE _native_install_testing_signal_handler(DDTRACE_UNUSED VALUE self) {
-   install_sigprof_signal_handler(testing_signal_handler);
+   install_sigprof_signal_handler(testing_signal_handler, "testing_signal_handler");
    return Qtrue;
  }

@@ -485,16 +603,11 @@ static void on_gc_event(VALUE tracepoint_data, DDTRACE_UNUSED void *unused) {
    int event = rb_tracearg_event_flag(rb_tracearg_from_tracepoint(tracepoint_data));
    if (event != RUBY_INTERNAL_EVENT_GC_ENTER && event != RUBY_INTERNAL_EVENT_GC_EXIT) return; // Unknown event

-   VALUE instance = active_sampler_instance; // Read from global variable
+   struct cpu_and_wall_time_worker_state *state = active_sampler_instance_state; // Read from global variable, see "sampler global state safety" note above

    // This should not happen in a normal situation because the tracepoint is always enabled after the instance is set
    // and disabled before it is cleared, but just in case...
-   if (instance == Qnil) return;
-
-   struct cpu_and_wall_time_worker_state *state;
-   if (!rb_typeddata_is_kind_of(instance, &cpu_and_wall_time_worker_typed_data)) return;
-   // This should never fail if the above check passes
-   TypedData_Get_Struct(instance, struct cpu_and_wall_time_worker_state, &cpu_and_wall_time_worker_typed_data, state);
+   if (state == NULL) return;

    if (event == RUBY_INTERNAL_EVENT_GC_ENTER) {
      cpu_and_wall_time_collector_on_gc_start(state->cpu_and_wall_time_collector_instance);
@@ -517,15 +630,18 @@ static void on_gc_event(VALUE tracepoint_data, DDTRACE_UNUSED void *unused) {
      cpu_and_wall_time_collector_on_gc_finish(state->cpu_and_wall_time_collector_instance);
      // We use rb_postponed_job_register_one to ask Ruby to run cpu_and_wall_time_collector_sample_after_gc after it
      // fully finishes the garbage collection, so that one is allowed to do allocations and throw exceptions as usual.
+     //
+     // Note: If we ever want to get rid of rb_postponed_job_register_one, remember not to clobber Ruby exceptions, as
+     // this function does this helpful job for us now -- https://github.com/ruby/ruby/commit/a98e343d39c4d7bf1e2190b076720f32d9f298b3.
      rb_postponed_job_register_one(0, after_gc_from_postponed_job, NULL);
    }
  }

  static void after_gc_from_postponed_job(DDTRACE_UNUSED void *_unused) {
-   VALUE instance = active_sampler_instance; // Read from global variable
+   struct cpu_and_wall_time_worker_state *state = active_sampler_instance_state; // Read from global variable, see "sampler global state safety" note above

    // This can potentially happen if the CpuAndWallTimeWorker was stopped while the postponed job was waiting to be executed; nothing to do
-   if (instance == Qnil) return;
+   if (state == NULL) return;

    // @ivoanjo: I'm not sure this can ever happen because `on_gc_event` only enqueues this callback if
    // it's running on the main Ractor, but just in case...
@@ -533,18 +649,15 @@ static void after_gc_from_postponed_job(DDTRACE_UNUSED void *_unused) {
      return; // We're not on the main Ractor; we currently don't support profiling non-main Ractors
    }

-   struct cpu_and_wall_time_worker_state *state;
-   TypedData_Get_Struct(instance, struct cpu_and_wall_time_worker_state, &cpu_and_wall_time_worker_typed_data, state);
-
    // Trigger sampling using the Collectors::CpuAndWallTime; rescue against any exceptions that happen during sampling
-   safely_call(cpu_and_wall_time_collector_sample_after_gc, state->cpu_and_wall_time_collector_instance, instance);
+   safely_call(cpu_and_wall_time_collector_sample_after_gc, state->cpu_and_wall_time_collector_instance, state->self_instance);
  }

  // Equivalent to Ruby begin/rescue call, where we call a C function and jump to the exception handler if an
  // exception gets raised within
- static void safely_call(VALUE (*function_to_call_safely)(VALUE), VALUE function_to_call_safely_arg, VALUE instance) {
+ static VALUE safely_call(VALUE (*function_to_call_safely)(VALUE), VALUE function_to_call_safely_arg, VALUE instance) {
    VALUE exception_handler_function_arg = instance;
-   rb_rescue2(
+   return rb_rescue2(
      function_to_call_safely,
      function_to_call_safely_arg,
      handle_sampling_failure,
@@ -567,3 +680,97 @@ static VALUE _native_simulate_sample_from_postponed_job(DDTRACE_UNUSED VALUE sel
    sample_from_postponed_job(NULL);
    return Qtrue;
  }
+
+ // After the Ruby VM forks, this method gets called in the child process to clean up any leftover state from the parent.
+ //
+ // Assumption: This method gets called BEFORE restarting profiling. Note that profiling-related tracepoints may still
+ // be active, so we make sure to disable them before calling into anything else, so that there are no components
+ // attempting to trigger samples at the same time as the reset is done.
+ //
+ // In the future, if we add more components with tracepoints, we will need to coordinate stopping all such
+ // tracepoints before doing the other cleaning steps.
+ static VALUE _native_reset_after_fork(DDTRACE_UNUSED VALUE self, VALUE instance) {
+   struct cpu_and_wall_time_worker_state *state;
+   TypedData_Get_Struct(instance, struct cpu_and_wall_time_worker_state, &cpu_and_wall_time_worker_typed_data, state);
+
+   // Disable all tracepoints, so that there are no more attempts to mutate the profile
+   rb_tracepoint_disable(state->gc_tracepoint);
+
+   reset_stats(state);
+
+   // Remove all state from the `Collectors::CpuAndWallTime` and connected downstream components
+   rb_funcall(state->cpu_and_wall_time_collector_instance, rb_intern("reset_after_fork"), 0);
+
+   return Qtrue;
+ }
+
+ static VALUE _native_is_sigprof_blocked_in_current_thread(DDTRACE_UNUSED VALUE self) {
+   return is_sigprof_blocked_in_current_thread();
+ }
+
+ static VALUE _native_stats(DDTRACE_UNUSED VALUE self, VALUE instance) {
+   struct cpu_and_wall_time_worker_state *state;
+   TypedData_Get_Struct(instance, struct cpu_and_wall_time_worker_state, &cpu_and_wall_time_worker_typed_data, state);
+
+   VALUE pretty_sampling_time_ns_min = state->stats.sampling_time_ns_min == UINT64_MAX ? Qnil : ULL2NUM(state->stats.sampling_time_ns_min);
+   VALUE pretty_sampling_time_ns_max = state->stats.sampling_time_ns_max == 0 ? Qnil : ULL2NUM(state->stats.sampling_time_ns_max);
+   VALUE pretty_sampling_time_ns_total = state->stats.sampling_time_ns_total == 0 ? Qnil : ULL2NUM(state->stats.sampling_time_ns_total);
+   VALUE pretty_sampling_time_ns_avg =
+     state->stats.sampled == 0 ? Qnil : DBL2NUM(((double) state->stats.sampling_time_ns_total) / state->stats.sampled);
+
+   VALUE stats_as_hash = rb_hash_new();
+   VALUE arguments[] = {
+     ID2SYM(rb_intern("trigger_sample_attempts")), /* => */ UINT2NUM(state->stats.trigger_sample_attempts),
+     ID2SYM(rb_intern("trigger_simulated_signal_delivery_attempts")), /* => */ UINT2NUM(state->stats.trigger_simulated_signal_delivery_attempts),
+     ID2SYM(rb_intern("simulated_signal_delivery")), /* => */ UINT2NUM(state->stats.simulated_signal_delivery),
+     ID2SYM(rb_intern("signal_handler_enqueued_sample")), /* => */ UINT2NUM(state->stats.signal_handler_enqueued_sample),
+     ID2SYM(rb_intern("signal_handler_wrong_thread")), /* => */ UINT2NUM(state->stats.signal_handler_wrong_thread),
+     ID2SYM(rb_intern("sampled")), /* => */ UINT2NUM(state->stats.sampled),
+     ID2SYM(rb_intern("sampling_time_ns_min")), /* => */ pretty_sampling_time_ns_min,
+     ID2SYM(rb_intern("sampling_time_ns_max")), /* => */ pretty_sampling_time_ns_max,
+     ID2SYM(rb_intern("sampling_time_ns_total")), /* => */ pretty_sampling_time_ns_total,
+     ID2SYM(rb_intern("sampling_time_ns_avg")), /* => */ pretty_sampling_time_ns_avg,
+   };
+   for (long unsigned int i = 0; i < VALUE_COUNT(arguments); i += 2) rb_hash_aset(stats_as_hash, arguments[i], arguments[i+1]);
+   return stats_as_hash;
+ }
+
+ void *simulate_sampling_signal_delivery(DDTRACE_UNUSED void *_unused) {
+   struct cpu_and_wall_time_worker_state *state = active_sampler_instance_state; // Read from global variable, see "sampler global state safety" note above
+
+   // This can potentially happen if the CpuAndWallTimeWorker was stopped while the IdleSamplingHelper was trying to execute this action
+   if (state == NULL) return NULL;
+
+   state->stats.simulated_signal_delivery++;
+
+   // @ivoanjo: We could instead directly call sample_from_postponed_job, but I chose to go through the signal handler
+   // so that the simulated case is as close to the original one as possible (including any metrics increases, etc).
+   handle_sampling_signal(0, NULL, NULL);
+
+   return NULL; // Unused
+ }
+
+ static void grab_gvl_and_sample(void) { rb_thread_call_with_gvl(simulate_sampling_signal_delivery, NULL); }
+
+ static void reset_stats(struct cpu_and_wall_time_worker_state *state) {
+   state->stats = (struct stats) {}; // Resets all stats back to zero
+   state->stats.sampling_time_ns_min = UINT64_MAX; // Since we always take the min between existing and latest sample
+ }
+
+ static void sleep_for(uint64_t time_ns) {
+   // As a simplification, we currently only support setting .tv_nsec
+   if (time_ns >= SECONDS_AS_NS(1)) {
+     grab_gvl_and_raise(rb_eArgError, "sleep_for can only sleep for less than 1 second, time_ns: %"PRIu64, time_ns);
+   }
+
+   struct timespec time_to_sleep = {.tv_nsec = time_ns};
+
+   while (nanosleep(&time_to_sleep, &time_to_sleep) != 0) {
+     if (errno == EINTR) {
+       // We were interrupted. nanosleep updates "time_to_sleep" to contain only the remaining time, so we just let the
+       // loop keep going.
+     } else {
+       ENFORCE_SUCCESS_NO_GVL(errno);
+     }
+   }
+ }
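A closing note on `sleep_for`: it only fills in `.tv_nsec` and raises for sleeps of one second or more. Below is a standalone sketch of the same EINTR-robust loop, extended (purely as an illustration -- the gem's version deliberately rejects this case) to also handle longer sleeps by filling in `.tv_sec`; `SECONDS_AS_NS` is redefined locally since `time_helpers.h` is not shown in this diff:

```c
#include <errno.h>
#include <stdint.h>
#include <time.h>

#define SECONDS_AS_NS(value) ((uint64_t) (value) * 1000 * 1000 * 1000)

// Returns 0 on success, or the errno of a non-EINTR failure
static int sleep_for_any(uint64_t time_ns) {
  struct timespec time_to_sleep = {
    .tv_sec  = (time_t) (time_ns / SECONDS_AS_NS(1)),
    .tv_nsec = (long) (time_ns % SECONDS_AS_NS(1)),
  };

  // nanosleep rewrites time_to_sleep with the remaining time when interrupted,
  // so retrying on EINTR resumes exactly where the previous call left off
  while (nanosleep(&time_to_sleep, &time_to_sleep) != 0) {
    if (errno != EINTR) return errno;
  }
  return 0;
}
```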