ddtrace 1.7.0 → 1.9.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (182) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +100 -1
  3. data/README.md +2 -2
  4. data/ext/ddtrace_profiling_loader/extconf.rb +4 -1
  5. data/ext/ddtrace_profiling_native_extension/NativeExtensionDesign.md +1 -1
  6. data/ext/ddtrace_profiling_native_extension/clock_id_from_pthread.c +3 -2
  7. data/ext/ddtrace_profiling_native_extension/collectors_cpu_and_wall_time.c +24 -50
  8. data/ext/ddtrace_profiling_native_extension/collectors_cpu_and_wall_time.h +1 -1
  9. data/ext/ddtrace_profiling_native_extension/collectors_cpu_and_wall_time_worker.c +284 -74
  10. data/ext/ddtrace_profiling_native_extension/collectors_dynamic_sampling_rate.c +142 -0
  11. data/ext/ddtrace_profiling_native_extension/collectors_dynamic_sampling_rate.h +14 -0
  12. data/ext/ddtrace_profiling_native_extension/collectors_idle_sampling_helper.c +241 -0
  13. data/ext/ddtrace_profiling_native_extension/collectors_idle_sampling_helper.h +3 -0
  14. data/ext/ddtrace_profiling_native_extension/collectors_stack.c +32 -32
  15. data/ext/ddtrace_profiling_native_extension/collectors_stack.h +2 -2
  16. data/ext/ddtrace_profiling_native_extension/extconf.rb +21 -7
  17. data/ext/ddtrace_profiling_native_extension/helpers.h +5 -0
  18. data/ext/ddtrace_profiling_native_extension/http_transport.c +50 -49
  19. data/ext/ddtrace_profiling_native_extension/libdatadog_helpers.h +5 -1
  20. data/ext/ddtrace_profiling_native_extension/native_extension_helpers.rb +42 -12
  21. data/ext/ddtrace_profiling_native_extension/private_vm_api_access.c +116 -22
  22. data/ext/ddtrace_profiling_native_extension/private_vm_api_access.h +9 -0
  23. data/ext/ddtrace_profiling_native_extension/profiling.c +205 -0
  24. data/ext/ddtrace_profiling_native_extension/ruby_helpers.c +86 -0
  25. data/ext/ddtrace_profiling_native_extension/ruby_helpers.h +28 -6
  26. data/ext/ddtrace_profiling_native_extension/setup_signal_handler.c +23 -4
  27. data/ext/ddtrace_profiling_native_extension/setup_signal_handler.h +4 -0
  28. data/ext/ddtrace_profiling_native_extension/stack_recorder.c +47 -50
  29. data/ext/ddtrace_profiling_native_extension/stack_recorder.h +4 -4
  30. data/ext/ddtrace_profiling_native_extension/time_helpers.c +17 -0
  31. data/ext/ddtrace_profiling_native_extension/time_helpers.h +10 -0
  32. data/lib/datadog/appsec/assets/waf_rules/recommended.json +75 -8
  33. data/lib/datadog/appsec/assets/waf_rules/risky.json +1 -1
  34. data/lib/datadog/appsec/assets/waf_rules/strict.json +1 -1
  35. data/lib/datadog/appsec/assets.rb +1 -1
  36. data/lib/datadog/appsec/configuration/settings.rb +35 -22
  37. data/lib/datadog/appsec/configuration.rb +4 -2
  38. data/lib/datadog/appsec/contrib/auto_instrument.rb +1 -1
  39. data/lib/datadog/appsec/contrib/configuration/settings.rb +1 -1
  40. data/lib/datadog/appsec/contrib/integration.rb +1 -1
  41. data/lib/datadog/appsec/contrib/patcher.rb +1 -1
  42. data/lib/datadog/appsec/contrib/rack/configuration/settings.rb +1 -1
  43. data/lib/datadog/appsec/contrib/rack/ext.rb +1 -1
  44. data/lib/datadog/appsec/contrib/rack/gateway/watcher.rb +1 -1
  45. data/lib/datadog/appsec/contrib/rack/reactive/request.rb +1 -1
  46. data/lib/datadog/appsec/contrib/rack/reactive/request_body.rb +1 -1
  47. data/lib/datadog/appsec/contrib/rack/reactive/response.rb +1 -1
  48. data/lib/datadog/appsec/contrib/rack/request.rb +1 -1
  49. data/lib/datadog/appsec/contrib/rack/response.rb +1 -1
  50. data/lib/datadog/appsec/contrib/rails/configuration/settings.rb +1 -1
  51. data/lib/datadog/appsec/contrib/rails/ext.rb +1 -1
  52. data/lib/datadog/appsec/contrib/rails/framework.rb +1 -1
  53. data/lib/datadog/appsec/contrib/rails/gateway/watcher.rb +1 -1
  54. data/lib/datadog/appsec/contrib/rails/reactive/action.rb +1 -1
  55. data/lib/datadog/appsec/contrib/rails/request.rb +1 -1
  56. data/lib/datadog/appsec/contrib/rails/request_middleware.rb +1 -1
  57. data/lib/datadog/appsec/contrib/sinatra/configuration/settings.rb +1 -1
  58. data/lib/datadog/appsec/contrib/sinatra/ext.rb +1 -1
  59. data/lib/datadog/appsec/contrib/sinatra/framework.rb +1 -1
  60. data/lib/datadog/appsec/contrib/sinatra/gateway/watcher.rb +1 -1
  61. data/lib/datadog/appsec/contrib/sinatra/reactive/routed.rb +1 -1
  62. data/lib/datadog/appsec/contrib/sinatra/request_middleware.rb +1 -1
  63. data/lib/datadog/appsec/event.rb +1 -1
  64. data/lib/datadog/appsec/extensions.rb +36 -26
  65. data/lib/datadog/appsec/instrumentation/gateway.rb +3 -3
  66. data/lib/datadog/appsec/processor.rb +15 -19
  67. data/lib/datadog/appsec/rate_limiter.rb +1 -1
  68. data/lib/datadog/appsec/reactive/address_hash.rb +1 -1
  69. data/lib/datadog/appsec/reactive/engine.rb +1 -1
  70. data/lib/datadog/appsec/reactive/operation.rb +2 -2
  71. data/lib/datadog/appsec/reactive/subscriber.rb +1 -1
  72. data/lib/datadog/appsec/response.rb +18 -9
  73. data/lib/datadog/appsec/utils/http/media_range.rb +201 -0
  74. data/lib/datadog/appsec/utils/http/media_type.rb +87 -0
  75. data/lib/datadog/appsec/utils/http.rb +9 -0
  76. data/lib/datadog/appsec/utils.rb +7 -0
  77. data/lib/datadog/appsec.rb +1 -1
  78. data/lib/datadog/ci/ext/environment.rb +57 -13
  79. data/lib/datadog/core/configuration/agent_settings_resolver.rb +2 -2
  80. data/lib/datadog/core/configuration/base.rb +3 -0
  81. data/lib/datadog/core/configuration/components.rb +27 -6
  82. data/lib/datadog/core/configuration/ext.rb +26 -0
  83. data/lib/datadog/core/configuration/option_definition.rb +11 -2
  84. data/lib/datadog/core/configuration/settings.rb +16 -341
  85. data/lib/datadog/core/diagnostics/environment_logger.rb +4 -3
  86. data/lib/datadog/core/diagnostics/health.rb +4 -22
  87. data/lib/datadog/core/environment/variable_helpers.rb +58 -10
  88. data/lib/datadog/core/metrics/client.rb +3 -2
  89. data/lib/datadog/core/metrics/ext.rb +0 -2
  90. data/lib/datadog/core/telemetry/collector.rb +1 -0
  91. data/lib/datadog/core/utils.rb +0 -21
  92. data/lib/datadog/core.rb +21 -1
  93. data/lib/datadog/kit/appsec/events.rb +75 -0
  94. data/lib/datadog/kit/enable_core_dumps.rb +1 -0
  95. data/lib/datadog/kit/identity.rb +8 -7
  96. data/lib/datadog/opentelemetry/api/context.rb +187 -0
  97. data/lib/datadog/opentelemetry/api/trace/span.rb +15 -0
  98. data/lib/datadog/opentelemetry/sdk/configurator.rb +38 -0
  99. data/lib/datadog/opentelemetry/sdk/id_generator.rb +27 -0
  100. data/lib/datadog/opentelemetry/sdk/propagator.rb +91 -0
  101. data/lib/datadog/opentelemetry/sdk/span_processor.rb +92 -0
  102. data/lib/datadog/opentelemetry.rb +48 -0
  103. data/lib/datadog/opentracer/distributed_headers.rb +2 -2
  104. data/lib/datadog/profiling/collectors/cpu_and_wall_time_worker.rb +16 -5
  105. data/lib/datadog/profiling/collectors/dynamic_sampling_rate.rb +14 -0
  106. data/lib/datadog/profiling/collectors/idle_sampling_helper.rb +68 -0
  107. data/lib/datadog/profiling/stack_recorder.rb +14 -0
  108. data/lib/datadog/profiling.rb +2 -0
  109. data/lib/datadog/tracing/configuration/ext.rb +33 -4
  110. data/lib/datadog/tracing/configuration/settings.rb +433 -0
  111. data/lib/datadog/tracing/contrib/aws/configuration/settings.rb +4 -1
  112. data/lib/datadog/tracing/contrib/aws/ext.rb +1 -0
  113. data/lib/datadog/tracing/contrib/dalli/configuration/settings.rb +4 -1
  114. data/lib/datadog/tracing/contrib/dalli/ext.rb +1 -0
  115. data/lib/datadog/tracing/contrib/elasticsearch/configuration/settings.rb +5 -1
  116. data/lib/datadog/tracing/contrib/elasticsearch/ext.rb +1 -0
  117. data/lib/datadog/tracing/contrib/ethon/configuration/settings.rb +6 -1
  118. data/lib/datadog/tracing/contrib/ethon/ext.rb +1 -0
  119. data/lib/datadog/tracing/contrib/excon/configuration/settings.rb +5 -1
  120. data/lib/datadog/tracing/contrib/excon/ext.rb +1 -0
  121. data/lib/datadog/tracing/contrib/faraday/configuration/settings.rb +5 -1
  122. data/lib/datadog/tracing/contrib/faraday/ext.rb +1 -0
  123. data/lib/datadog/tracing/contrib/grpc/configuration/settings.rb +6 -1
  124. data/lib/datadog/tracing/contrib/grpc/distributed/propagation.rb +9 -4
  125. data/lib/datadog/tracing/contrib/grpc/ext.rb +1 -0
  126. data/lib/datadog/tracing/contrib/http/configuration/settings.rb +11 -1
  127. data/lib/datadog/tracing/contrib/http/distributed/fetcher.rb +10 -3
  128. data/lib/datadog/tracing/contrib/http/distributed/propagation.rb +9 -4
  129. data/lib/datadog/tracing/contrib/http/ext.rb +2 -0
  130. data/lib/datadog/tracing/contrib/http/instrumentation.rb +3 -6
  131. data/lib/datadog/tracing/contrib/httpclient/configuration/settings.rb +11 -1
  132. data/lib/datadog/tracing/contrib/httpclient/ext.rb +2 -0
  133. data/lib/datadog/tracing/contrib/httpclient/instrumentation.rb +3 -4
  134. data/lib/datadog/tracing/contrib/httprb/configuration/settings.rb +11 -1
  135. data/lib/datadog/tracing/contrib/httprb/ext.rb +2 -0
  136. data/lib/datadog/tracing/contrib/httprb/instrumentation.rb +3 -4
  137. data/lib/datadog/tracing/contrib/mongodb/configuration/settings.rb +5 -1
  138. data/lib/datadog/tracing/contrib/mongodb/ext.rb +1 -0
  139. data/lib/datadog/tracing/contrib/mysql2/configuration/settings.rb +4 -1
  140. data/lib/datadog/tracing/contrib/mysql2/ext.rb +1 -0
  141. data/lib/datadog/tracing/contrib/mysql2/instrumentation.rb +2 -2
  142. data/lib/datadog/tracing/contrib/patcher.rb +3 -2
  143. data/lib/datadog/tracing/contrib/pg/configuration/settings.rb +4 -1
  144. data/lib/datadog/tracing/contrib/pg/ext.rb +1 -0
  145. data/lib/datadog/tracing/contrib/pg/instrumentation.rb +56 -33
  146. data/lib/datadog/tracing/contrib/presto/configuration/settings.rb +4 -1
  147. data/lib/datadog/tracing/contrib/presto/ext.rb +1 -0
  148. data/lib/datadog/tracing/contrib/propagation/sql_comment/ext.rb +1 -0
  149. data/lib/datadog/tracing/contrib/propagation/sql_comment.rb +10 -12
  150. data/lib/datadog/tracing/contrib/redis/configuration/settings.rb +4 -1
  151. data/lib/datadog/tracing/contrib/redis/ext.rb +1 -0
  152. data/lib/datadog/tracing/contrib/redis/instrumentation.rb +30 -23
  153. data/lib/datadog/tracing/contrib/redis/integration.rb +34 -2
  154. data/lib/datadog/tracing/contrib/redis/patcher.rb +18 -14
  155. data/lib/datadog/tracing/contrib/redis/quantize.rb +12 -9
  156. data/lib/datadog/tracing/contrib/redis/tags.rb +4 -6
  157. data/lib/datadog/tracing/contrib/redis/trace_middleware.rb +72 -0
  158. data/lib/datadog/tracing/contrib/rest_client/configuration/settings.rb +6 -1
  159. data/lib/datadog/tracing/contrib/rest_client/ext.rb +1 -0
  160. data/lib/datadog/tracing/contrib/stripe/configuration/settings.rb +33 -0
  161. data/lib/datadog/tracing/contrib/stripe/ext.rb +26 -0
  162. data/lib/datadog/tracing/contrib/stripe/integration.rb +43 -0
  163. data/lib/datadog/tracing/contrib/stripe/patcher.rb +29 -0
  164. data/lib/datadog/tracing/contrib/stripe/request.rb +67 -0
  165. data/lib/datadog/tracing/contrib.rb +1 -0
  166. data/lib/datadog/{core → tracing}/diagnostics/ext.rb +1 -6
  167. data/lib/datadog/tracing/diagnostics/health.rb +40 -0
  168. data/lib/datadog/tracing/distributed/{b3.rb → b3_multi.rb} +2 -2
  169. data/lib/datadog/tracing/distributed/helpers.rb +2 -1
  170. data/lib/datadog/tracing/distributed/none.rb +19 -0
  171. data/lib/datadog/tracing/distributed/trace_context.rb +378 -0
  172. data/lib/datadog/tracing/metadata/ext.rb +1 -1
  173. data/lib/datadog/tracing/metadata/tagging.rb +6 -0
  174. data/lib/datadog/tracing/sampling/priority_sampler.rb +11 -0
  175. data/lib/datadog/tracing/sampling/rate_sampler.rb +3 -3
  176. data/lib/datadog/tracing/span.rb +3 -19
  177. data/lib/datadog/tracing/span_operation.rb +5 -4
  178. data/lib/datadog/tracing/trace_digest.rb +85 -2
  179. data/lib/datadog/tracing/trace_operation.rb +13 -4
  180. data/lib/datadog/tracing/utils.rb +50 -0
  181. data/lib/ddtrace/version.rb +1 -1
  182. metadata +41 -9
@@ -3,12 +3,18 @@
3
3
  #include <ruby/thread_native.h>
4
4
  #include <ruby/debug.h>
5
5
  #include <stdbool.h>
6
+ #include <stdatomic.h>
6
7
  #include <signal.h>
8
+ #include <errno.h>
9
+
7
10
  #include "helpers.h"
8
11
  #include "ruby_helpers.h"
9
12
  #include "collectors_cpu_and_wall_time.h"
13
+ #include "collectors_dynamic_sampling_rate.h"
14
+ #include "collectors_idle_sampling_helper.h"
10
15
  #include "private_vm_api_access.h"
11
16
  #include "setup_signal_handler.h"
17
+ #include "time_helpers.h"
12
18
 
13
19
  // Used to trigger the periodic execution of Collectors::CpuAndWallTime, which implements all of the sampling logic
14
20
  // itself; this class only implements the "doing it periodically" part.
@@ -71,19 +77,42 @@
71
77
 
72
78
  // Contains state for a single CpuAndWallTimeWorker instance
73
79
  struct cpu_and_wall_time_worker_state {
74
- // Important: This is not atomic nor is it guaranteed to replace memory barriers and the like. Aka this works for
75
- // telling the sampling trigger loop to stop, but if we ever need to communicate more, we should move to actual
76
- // atomic operations. stdatomic.h seems a nice thing to reach out for.
77
- volatile bool should_run;
80
+ atomic_bool should_run;
81
+
78
82
  bool gc_profiling_enabled;
83
+ VALUE self_instance;
79
84
  VALUE cpu_and_wall_time_collector_instance;
85
+ VALUE idle_sampling_helper_instance;
86
+ VALUE owner_thread;
87
+ dynamic_sampling_rate_state dynamic_sampling_rate;
80
88
 
81
89
  // When something goes wrong during sampling, we record the Ruby exception here, so that it can be "re-raised" on
82
90
  // the CpuAndWallTimeWorker thread
83
91
  VALUE failure_exception;
92
+ // Used by `_native_stop` to flag the worker thread to stop, even if it has not started yet (see comment on `_native_sampling_loop`)
93
+ VALUE stop_thread;
84
94
 
85
95
  // Used to get gc start/finish information
86
96
  VALUE gc_tracepoint;
97
+
98
+ struct stats {
99
+ // How many times we tried to trigger a sample
100
+ unsigned int trigger_sample_attempts;
101
+ // How many times we tried to simulate signal delivery
102
+ unsigned int trigger_simulated_signal_delivery_attempts;
103
+ // How many times we actually simulated signal delivery
104
+ unsigned int simulated_signal_delivery;
105
+ // How many times we actually called rb_postponed_job_register_one from a signal handler
106
+ unsigned int signal_handler_enqueued_sample;
107
+ // How many times the signal handler was called from the wrong thread
108
+ unsigned int signal_handler_wrong_thread;
109
+ // How many times we actually sampled (except GC samples)
110
+ unsigned int sampled;
111
+ // Min/max/total wall-time spent sampling (except GC samples)
112
+ uint64_t sampling_time_ns_min;
113
+ uint64_t sampling_time_ns_max;
114
+ uint64_t sampling_time_ns_total;
115
+ } stats;
87
116
  };
88
117
 
89
118
  static VALUE _native_new(VALUE klass);
@@ -91,16 +120,18 @@ static VALUE _native_initialize(
91
120
  DDTRACE_UNUSED VALUE _self,
92
121
  VALUE self_instance,
93
122
  VALUE cpu_and_wall_time_collector_instance,
94
- VALUE gc_profiling_enabled
123
+ VALUE gc_profiling_enabled,
124
+ VALUE idle_sampling_helper_instance
95
125
  );
96
126
  static void cpu_and_wall_time_worker_typed_data_mark(void *state_ptr);
97
127
  static VALUE _native_sampling_loop(VALUE self, VALUE instance);
98
- static VALUE _native_stop(DDTRACE_UNUSED VALUE _self, VALUE self_instance);
128
+ static VALUE _native_stop(DDTRACE_UNUSED VALUE _self, VALUE self_instance, VALUE worker_thread);
99
129
  static VALUE stop(VALUE self_instance, VALUE optional_exception);
100
130
  static void handle_sampling_signal(DDTRACE_UNUSED int _signal, DDTRACE_UNUSED siginfo_t *_info, DDTRACE_UNUSED void *_ucontext);
101
131
  static void *run_sampling_trigger_loop(void *state_ptr);
102
132
  static void interrupt_sampling_trigger_loop(void *state_ptr);
103
133
  static void sample_from_postponed_job(DDTRACE_UNUSED void *_unused);
134
+ static VALUE rescued_sample_from_postponed_job(VALUE self_instance);
104
135
  static VALUE handle_sampling_failure(VALUE self_instance, VALUE exception);
105
136
  static VALUE _native_current_sigprof_signal_handler(DDTRACE_UNUSED VALUE self);
106
137
  static VALUE release_gvl_and_run_sampling_trigger_loop(VALUE instance);
@@ -112,23 +143,30 @@ static VALUE _native_trigger_sample(DDTRACE_UNUSED VALUE self);
112
143
  static VALUE _native_gc_tracepoint(DDTRACE_UNUSED VALUE self, VALUE instance);
113
144
  static void on_gc_event(VALUE tracepoint_data, DDTRACE_UNUSED void *unused);
114
145
  static void after_gc_from_postponed_job(DDTRACE_UNUSED void *_unused);
115
- static void safely_call(VALUE (*function_to_call_safely)(VALUE), VALUE function_to_call_safely_arg, VALUE instance);
146
+ static VALUE safely_call(VALUE (*function_to_call_safely)(VALUE), VALUE function_to_call_safely_arg, VALUE instance);
116
147
  static VALUE _native_simulate_handle_sampling_signal(DDTRACE_UNUSED VALUE self);
117
148
  static VALUE _native_simulate_sample_from_postponed_job(DDTRACE_UNUSED VALUE self);
118
149
  static VALUE _native_reset_after_fork(DDTRACE_UNUSED VALUE self, VALUE instance);
119
-
120
- // Global state -- be very careful when accessing or modifying it
121
-
122
- // Note: Global state must only be mutated while holding the global VM lock (we piggy back on it to ensure correctness).
123
- // The active_sampler_instance needs to be global because we access it from the signal handler.
150
+ static VALUE _native_is_sigprof_blocked_in_current_thread(DDTRACE_UNUSED VALUE self);
151
+ static VALUE _native_stats(DDTRACE_UNUSED VALUE self, VALUE instance);
152
+ void *simulate_sampling_signal_delivery(DDTRACE_UNUSED void *_unused);
153
+ static void grab_gvl_and_sample(void);
154
+ static void reset_stats(struct cpu_and_wall_time_worker_state *state);
155
+ static void sleep_for(uint64_t time_ns);
156
+
157
+ // Note on sampler global state safety:
158
+ //
159
+ // Both `active_sampler_instance` and `active_sampler_instance_state` are **GLOBAL** state. Be careful when accessing
160
+ // or modifying them.
161
+ // In particular, it's important to only mutate them while holding the global VM lock, to ensure correctness.
162
+ //
163
+ // This global state is needed because a bunch of functions on this file need to access it from situations
164
+ // (e.g. signal handler) where it's impossible or just awkward to pass it as an argument.
124
165
  static VALUE active_sampler_instance = Qnil;
125
- // ...We also store active_sampler_owner_thread to be able to tell who the active_sampler_instance belongs to (and also
126
- // to detect when it is outdated)
127
- static VALUE active_sampler_owner_thread = Qnil;
166
+ struct cpu_and_wall_time_worker_state *active_sampler_instance_state = NULL;
128
167
 
129
168
  void collectors_cpu_and_wall_time_worker_init(VALUE profiling_module) {
130
169
  rb_global_variable(&active_sampler_instance);
131
- rb_global_variable(&active_sampler_owner_thread);
132
170
 
133
171
  VALUE collectors_module = rb_define_module_under(profiling_module, "Collectors");
134
172
  VALUE collectors_cpu_and_wall_time_worker_class = rb_define_class_under(collectors_module, "CpuAndWallTimeWorker", rb_cObject);
@@ -145,10 +183,11 @@ void collectors_cpu_and_wall_time_worker_init(VALUE profiling_module) {
145
183
  // https://bugs.ruby-lang.org/issues/18007 for a discussion around this.
146
184
  rb_define_alloc_func(collectors_cpu_and_wall_time_worker_class, _native_new);
147
185
 
148
- rb_define_singleton_method(collectors_cpu_and_wall_time_worker_class, "_native_initialize", _native_initialize, 3);
186
+ rb_define_singleton_method(collectors_cpu_and_wall_time_worker_class, "_native_initialize", _native_initialize, 4);
149
187
  rb_define_singleton_method(collectors_cpu_and_wall_time_worker_class, "_native_sampling_loop", _native_sampling_loop, 1);
150
- rb_define_singleton_method(collectors_cpu_and_wall_time_worker_class, "_native_stop", _native_stop, 1);
188
+ rb_define_singleton_method(collectors_cpu_and_wall_time_worker_class, "_native_stop", _native_stop, 2);
151
189
  rb_define_singleton_method(collectors_cpu_and_wall_time_worker_class, "_native_reset_after_fork", _native_reset_after_fork, 1);
190
+ rb_define_singleton_method(collectors_cpu_and_wall_time_worker_class, "_native_stats", _native_stats, 1);
152
191
  rb_define_singleton_method(testing_module, "_native_current_sigprof_signal_handler", _native_current_sigprof_signal_handler, 0);
153
192
  rb_define_singleton_method(testing_module, "_native_is_running?", _native_is_running, 1);
154
193
  rb_define_singleton_method(testing_module, "_native_install_testing_signal_handler", _native_install_testing_signal_handler, 0);
@@ -157,6 +196,7 @@ void collectors_cpu_and_wall_time_worker_init(VALUE profiling_module) {
157
196
  rb_define_singleton_method(testing_module, "_native_gc_tracepoint", _native_gc_tracepoint, 1);
158
197
  rb_define_singleton_method(testing_module, "_native_simulate_handle_sampling_signal", _native_simulate_handle_sampling_signal, 0);
159
198
  rb_define_singleton_method(testing_module, "_native_simulate_sample_from_postponed_job", _native_simulate_sample_from_postponed_job, 0);
199
+ rb_define_singleton_method(testing_module, "_native_is_sigprof_blocked_in_current_thread", _native_is_sigprof_blocked_in_current_thread, 0);
160
200
  }
161
201
 
162
202
  // This structure is used to define a Ruby object that stores a pointer to a struct cpu_and_wall_time_worker_state
@@ -166,7 +206,7 @@ static const rb_data_type_t cpu_and_wall_time_worker_typed_data = {
166
206
  .function = {
167
207
  .dmark = cpu_and_wall_time_worker_typed_data_mark,
168
208
  .dfree = RUBY_DEFAULT_FREE,
169
- .dsize = NULL, // We don't track profile memory usage (although it'd be cool if we did!)
209
+ .dsize = NULL, // We don't track memory usage (although it'd be cool if we did!)
170
210
  //.dcompact = NULL, // FIXME: Add support for compaction
171
211
  },
172
212
  .flags = RUBY_TYPED_FREE_IMMEDIATELY
@@ -175,20 +215,26 @@ static const rb_data_type_t cpu_and_wall_time_worker_typed_data = {
175
215
  static VALUE _native_new(VALUE klass) {
176
216
  struct cpu_and_wall_time_worker_state *state = ruby_xcalloc(1, sizeof(struct cpu_and_wall_time_worker_state));
177
217
 
178
- state->should_run = false;
218
+ atomic_init(&state->should_run, false);
179
219
  state->gc_profiling_enabled = false;
180
220
  state->cpu_and_wall_time_collector_instance = Qnil;
221
+ state->idle_sampling_helper_instance = Qnil;
222
+ state->owner_thread = Qnil;
223
+ dynamic_sampling_rate_init(&state->dynamic_sampling_rate);
181
224
  state->failure_exception = Qnil;
225
+ state->stop_thread = Qnil;
182
226
  state->gc_tracepoint = Qnil;
227
+ reset_stats(state);
183
228
 
184
- return TypedData_Wrap_Struct(klass, &cpu_and_wall_time_worker_typed_data, state);
229
+ return state->self_instance = TypedData_Wrap_Struct(klass, &cpu_and_wall_time_worker_typed_data, state);
185
230
  }
186
231
 
187
232
  static VALUE _native_initialize(
188
233
  DDTRACE_UNUSED VALUE _self,
189
234
  VALUE self_instance,
190
235
  VALUE cpu_and_wall_time_collector_instance,
191
- VALUE gc_profiling_enabled
236
+ VALUE gc_profiling_enabled,
237
+ VALUE idle_sampling_helper_instance
192
238
  ) {
193
239
  ENFORCE_BOOLEAN(gc_profiling_enabled);
194
240
 
@@ -197,6 +243,7 @@ static VALUE _native_initialize(
197
243
 
198
244
  state->gc_profiling_enabled = (gc_profiling_enabled == Qtrue);
199
245
  state->cpu_and_wall_time_collector_instance = enforce_cpu_and_wall_time_collector_instance(cpu_and_wall_time_collector_instance);
246
+ state->idle_sampling_helper_instance = idle_sampling_helper_instance;
200
247
  state->gc_tracepoint = rb_tracepoint_new(Qnil, RUBY_INTERNAL_EVENT_GC_ENTER | RUBY_INTERNAL_EVENT_GC_EXIT, on_gc_event, NULL /* unused */);
201
248
 
202
249
  return Qtrue;
@@ -207,7 +254,10 @@ static void cpu_and_wall_time_worker_typed_data_mark(void *state_ptr) {
207
254
  struct cpu_and_wall_time_worker_state *state = (struct cpu_and_wall_time_worker_state *) state_ptr;
208
255
 
209
256
  rb_gc_mark(state->cpu_and_wall_time_collector_instance);
257
+ rb_gc_mark(state->idle_sampling_helper_instance);
258
+ rb_gc_mark(state->owner_thread);
210
259
  rb_gc_mark(state->failure_exception);
260
+ rb_gc_mark(state->stop_thread);
211
261
  rb_gc_mark(state->gc_tracepoint);
212
262
  }
213
263
 
@@ -216,8 +266,9 @@ static VALUE _native_sampling_loop(DDTRACE_UNUSED VALUE _self, VALUE instance) {
216
266
  struct cpu_and_wall_time_worker_state *state;
217
267
  TypedData_Get_Struct(instance, struct cpu_and_wall_time_worker_state, &cpu_and_wall_time_worker_typed_data, state);
218
268
 
219
- if (active_sampler_owner_thread != Qnil) {
220
- if (is_thread_alive(active_sampler_owner_thread)) {
269
+ struct cpu_and_wall_time_worker_state *old_state = active_sampler_instance_state;
270
+ if (old_state != NULL) {
271
+ if (is_thread_alive(old_state->owner_thread)) {
221
272
  rb_raise(
222
273
  rb_eRuntimeError,
223
274
  "Could not start CpuAndWallTimeWorker: There's already another instance of CpuAndWallTimeWorker active in a different thread"
@@ -231,23 +282,26 @@ static VALUE _native_sampling_loop(DDTRACE_UNUSED VALUE _self, VALUE instance) {
231
282
  // b) If this is the same instance of the CpuAndWallTimeWorker and we call enable on a tracepoint that is already
232
283
  // enabled, it will start firing more than once, see https://bugs.ruby-lang.org/issues/19114 for details.
233
284
 
234
- struct cpu_and_wall_time_worker_state *old_state;
235
- TypedData_Get_Struct(active_sampler_instance, struct cpu_and_wall_time_worker_state, &cpu_and_wall_time_worker_typed_data, old_state);
236
285
  rb_tracepoint_disable(old_state->gc_tracepoint);
237
286
  }
238
287
  }
239
288
 
289
+ // We use `stop_thread` to distinguish when `_native_stop` was called before we actually had a chance to start. In this
290
+ // situation we stop immediately and never even start the sampling trigger loop.
291
+ if (state->stop_thread == rb_thread_current()) return Qnil;
292
+
293
+ // Reset the dynamic sampling rate state, if any (reminder: the monotonic clock reference may change after a fork)
294
+ dynamic_sampling_rate_reset(&state->dynamic_sampling_rate);
295
+
240
296
  // This write to a global is thread-safe BECAUSE we're still holding on to the global VM lock at this point
297
+ active_sampler_instance_state = state;
241
298
  active_sampler_instance = instance;
242
- active_sampler_owner_thread = rb_thread_current();
299
+ state->owner_thread = rb_thread_current();
243
300
 
244
- state->should_run = true;
301
+ atomic_store(&state->should_run, true);
245
302
 
246
303
  block_sigprof_signal_handler_from_running_in_current_thread(); // We want to interrupt the thread with the global VM lock, never this one
247
304
 
248
- install_sigprof_signal_handler(handle_sampling_signal, "handle_sampling_signal");
249
- if (state->gc_profiling_enabled) rb_tracepoint_enable(state->gc_tracepoint);
250
-
251
305
  // Release GVL, get to the actual work!
252
306
  int exception_state;
253
307
  rb_protect(release_gvl_and_run_sampling_trigger_loop, instance, &exception_state);
@@ -256,6 +310,19 @@ static VALUE _native_sampling_loop(DDTRACE_UNUSED VALUE _self, VALUE instance) {
256
310
 
257
311
  rb_tracepoint_disable(state->gc_tracepoint);
258
312
 
313
+ active_sampler_instance_state = NULL;
314
+ active_sampler_instance = Qnil;
315
+ state->owner_thread = Qnil;
316
+
317
+ // If this `Thread` is about to die, why is this important? It's because Ruby caches native threads for a period after
318
+ // the `Thread` dies, and reuses them if a new Ruby `Thread` gets created. This means that while conceptually the
319
+ // worker background `Thread` is about to die, the low-level native OS thread can be reused for something else in the Ruby app.
320
+ // Then, the reused thread would "inherit" the SIGPROF blocking, which is... really unexpected.
321
+ // This actually caused a flaky test -- the `native_extension_spec.rb` creates a `Thread` and tries to specifically
322
+ // send SIGPROF signals to it, and oops it could fail if it got the reused native thread from the worker which still
323
+ // had SIGPROF delivery blocked. :hide_the_pain_harold:
324
+ unblock_sigprof_signal_handler_from_running_in_current_thread();
325
+
259
326
  // Why replace and not remove the signal handler? We do this because when a process receives a SIGPROF without
260
327
  // having an explicit signal handler set up, the process will instantly terminate with a confusing
261
328
  // "Profiling timer expired" message left behind. (This message doesn't come from us -- it's the default message for
@@ -265,11 +332,10 @@ static VALUE _native_sampling_loop(DDTRACE_UNUSED VALUE _self, VALUE instance) {
265
332
  // profiler-sent signals by the time we get here and want to clean up.
266
333
  // @ivoanjo: I suspect this will never happen, but the cost of getting it wrong is really high (VM terminates) so this
267
334
  // is a just-in-case situation.
335
+ //
336
+ // Note 2: This can raise exceptions as well, so make sure that all cleanups are done by the time we get here.
268
337
  replace_sigprof_signal_handler_with_empty_handler(handle_sampling_signal);
269
338
 
270
- active_sampler_instance = Qnil;
271
- active_sampler_owner_thread = Qnil;
272
-
273
339
  // Ensure that instance is not garbage collected while the native sampling loop is running; this is probably not needed, but just in case
274
340
  RB_GC_GUARD(instance);
275
341
 
@@ -278,7 +344,12 @@ static VALUE _native_sampling_loop(DDTRACE_UNUSED VALUE _self, VALUE instance) {
278
344
  return Qnil;
279
345
  }
280
346
 
281
- static VALUE _native_stop(DDTRACE_UNUSED VALUE _self, VALUE self_instance) {
347
+ static VALUE _native_stop(DDTRACE_UNUSED VALUE _self, VALUE self_instance, VALUE worker_thread) {
348
+ struct cpu_and_wall_time_worker_state *state;
349
+ TypedData_Get_Struct(self_instance, struct cpu_and_wall_time_worker_state, &cpu_and_wall_time_worker_typed_data, state);
350
+
351
+ state->stop_thread = worker_thread;
352
+
282
353
  return stop(self_instance, /* optional_exception: */ Qnil);
283
354
  }
284
355
 
@@ -286,7 +357,7 @@ static VALUE stop(VALUE self_instance, VALUE optional_exception) {
286
357
  struct cpu_and_wall_time_worker_state *state;
287
358
  TypedData_Get_Struct(self_instance, struct cpu_and_wall_time_worker_state, &cpu_and_wall_time_worker_typed_data, state);
288
359
 
289
- state->should_run = false;
360
+ atomic_store(&state->should_run, false);
290
361
  state->failure_exception = optional_exception;
291
362
 
292
363
  // Disable the GC tracepoint as soon as possible, so the VM doesn't keep on calling it
@@ -299,41 +370,76 @@ static VALUE stop(VALUE self_instance, VALUE optional_exception) {
299
370
  // We need to be careful not to change any state that may be observed OR to restore it if we do. For instance, if anything
300
371
  // we do here can set `errno`, then we must be careful to restore the old `errno` after the fact.
301
372
  static void handle_sampling_signal(DDTRACE_UNUSED int _signal, DDTRACE_UNUSED siginfo_t *_info, DDTRACE_UNUSED void *_ucontext) {
302
- if (!ruby_thread_has_gvl_p()) {
303
- return; // Not safe to enqueue a sample from this thread
304
- }
305
- if (!ddtrace_rb_ractor_main_p()) {
306
- return; // We're not on the main Ractor; we currently don't support profiling non-main Ractors
373
+ struct cpu_and_wall_time_worker_state *state = active_sampler_instance_state; // Read from global variable, see "sampler global state safety" note above
374
+
375
+ // This can potentially happen if the CpuAndWallTimeWorker was stopped while the signal delivery was happening; nothing to do
376
+ if (state == NULL) return;
377
+
378
+ if (
379
+ !ruby_native_thread_p() || // Not a Ruby thread
380
+ !is_current_thread_holding_the_gvl() || // Not safe to enqueue a sample from this thread
381
+ !ddtrace_rb_ractor_main_p() // We're not on the main Ractor; we currently don't support profiling non-main Ractors
382
+ ) {
383
+ state->stats.signal_handler_wrong_thread++;
384
+ return;
307
385
  }
308
386
 
309
387
  // We implicitly assume there can be no concurrent nor nested calls to handle_sampling_signal because
310
388
  // a) we get triggered using SIGPROF, and the docs state a second SIGPROF will not interrupt an existing one
311
389
  // b) we validate we are in the thread that has the global VM lock; if a different thread gets a signal, it will return early
312
390
  // because it will not have the global VM lock
313
- // TODO: Validate that this does not impact Ractors
314
391
 
315
392
  // Note: rb_postponed_job_register_one ensures that if there's a previous sample_from_postponed_job queued for execution
316
393
  // then we will not queue a second one. It does this by doing a linear scan on the existing jobs; in the future we
317
394
  // may want to implement that check ourselves.
318
395
 
319
- // TODO: Do something with result (potentially update tracking counters?)
396
+ state->stats.signal_handler_enqueued_sample++;
397
+
398
+ // Note: If we ever want to get rid of rb_postponed_job_register_one, remember not to clobber Ruby exceptions, as
399
+ // this function does this helpful job for us now -- https://github.com/ruby/ruby/commit/a98e343d39c4d7bf1e2190b076720f32d9f298b3.
320
400
  /*int result =*/ rb_postponed_job_register_one(0, sample_from_postponed_job, NULL);
401
+ // TODO: Do something with result (potentially update tracking counters?)
321
402
  }
322
403
 
323
404
  // The actual sampling trigger loop always runs **without** the global vm lock.
324
405
  static void *run_sampling_trigger_loop(void *state_ptr) {
325
406
  struct cpu_and_wall_time_worker_state *state = (struct cpu_and_wall_time_worker_state *) state_ptr;
326
407
 
327
- struct timespec time_between_signals = {.tv_nsec = 10 * 1000 * 1000 /* 10ms */};
408
+ uint64_t minimum_time_between_signals = MILLIS_AS_NS(10);
409
+
410
+ while (atomic_load(&state->should_run)) {
411
+ state->stats.trigger_sample_attempts++;
328
412
 
329
- while (state->should_run) {
330
413
  // TODO: This is still a placeholder for a more complex mechanism. In particular:
331
- // * We want to signal a particular thread or threads, not the process in general
332
- // * We want to track if a signal landed on the thread holding the global VM lock and do something about it
333
414
  // * We want to do more than having a fixed sampling rate
334
415
 
335
- kill(getpid(), SIGPROF);
336
- nanosleep(&time_between_signals, NULL);
416
+ current_gvl_owner owner = gvl_owner();
417
+ if (owner.valid) {
418
+ // Note that reading the GVL owner and sending them a signal is a race -- the Ruby VM keeps on executing while
419
+ // we're doing this, so we may still not signal the correct thread from time to time, but our signal handler
420
+ // includes a check to see if it got called in the right thread
421
+ pthread_kill(owner.owner, SIGPROF);
422
+ } else {
423
+ // If no thread owns the Global VM Lock, the application is probably idle at the moment. We still want to sample
424
+ // so we "ask a friend" (the IdleSamplingHelper component) to grab the GVL and simulate getting a SIGPROF.
425
+ //
426
+ // In a previous version of the code, we called `grab_gvl_and_sample` directly BUT this was problematic because
427
+ // Ruby may concurrently get busy and so the CpuAndWallTimeWorker would be blocked in line to acquire the GVL
428
+ // for an uncontrolled amount of time. (This can still happen to the IdleSamplingHelper, but the
429
+ // CpuAndWallTimeWorker will still be free to interrupt the Ruby VM and keep sampling for the entire blocking period).
430
+ state->stats.trigger_simulated_signal_delivery_attempts++;
431
+ idle_sampling_helper_request_action(state->idle_sampling_helper_instance, grab_gvl_and_sample);
432
+ }
433
+
434
+ sleep_for(minimum_time_between_signals);
435
+
436
+ // The dynamic sampling rate module keeps track of how long samples are taking, and in here we extend our sleep time
437
+ // to take that into account.
438
+ // Note that we deliberately should NOT combine this sleep_for with the one above because the result of
439
+ // `dynamic_sampling_rate_get_sleep` may have changed while the above sleep was ongoing.
440
+ uint64_t extra_sleep =
441
+ dynamic_sampling_rate_get_sleep(&state->dynamic_sampling_rate, monotonic_wall_time_now_ns(DO_NOT_RAISE_ON_FAILURE));
442
+ if (extra_sleep > 0) sleep_for(extra_sleep);
337
443
  }
338
444
 
339
445
  return NULL; // Unused
@@ -343,14 +449,14 @@ static void *run_sampling_trigger_loop(void *state_ptr) {
343
449
  static void interrupt_sampling_trigger_loop(void *state_ptr) {
344
450
  struct cpu_and_wall_time_worker_state *state = (struct cpu_and_wall_time_worker_state *) state_ptr;
345
451
 
346
- state->should_run = false;
452
+ atomic_store(&state->should_run, false);
347
453
  }
348
454
 
349
455
  static void sample_from_postponed_job(DDTRACE_UNUSED void *_unused) {
350
- VALUE instance = active_sampler_instance; // Read from global variable
456
+ struct cpu_and_wall_time_worker_state *state = active_sampler_instance_state; // Read from global variable, see "sampler global state safety" note above
351
457
 
352
458
  // This can potentially happen if the CpuAndWallTimeWorker was stopped while the postponed job was waiting to be executed; nothing to do
353
- if (instance == Qnil) return;
459
+ if (state == NULL) return;
354
460
 
355
461
  // @ivoanjo: I'm not sure this can ever happen because `handle_sampling_signal` only enqueues this callback if
356
462
  // it's running on the main Ractor, but just in case...
@@ -358,14 +464,45 @@ static void sample_from_postponed_job(DDTRACE_UNUSED void *_unused) {
358
464
  return; // We're not on the main Ractor; we currently don't support profiling non-main Ractors
359
465
  }
360
466
 
467
+ // Rescue against any exceptions that happen during sampling
468
+ safely_call(rescued_sample_from_postponed_job, state->self_instance, state->self_instance);
469
+ }
470
+
471
+ static VALUE rescued_sample_from_postponed_job(VALUE self_instance) {
361
472
  struct cpu_and_wall_time_worker_state *state;
362
- TypedData_Get_Struct(instance, struct cpu_and_wall_time_worker_state, &cpu_and_wall_time_worker_typed_data, state);
473
+ TypedData_Get_Struct(self_instance, struct cpu_and_wall_time_worker_state, &cpu_and_wall_time_worker_typed_data, state);
363
474
 
364
- // Trigger sampling using the Collectors::CpuAndWallTime; rescue against any exceptions that happen during sampling
365
- safely_call(cpu_and_wall_time_collector_sample, state->cpu_and_wall_time_collector_instance, instance);
475
+ long wall_time_ns_before_sample = monotonic_wall_time_now_ns(RAISE_ON_FAILURE);
476
+
477
+ if (!dynamic_sampling_rate_should_sample(&state->dynamic_sampling_rate, wall_time_ns_before_sample)) {
478
+ // TODO: Add a counter for this
479
+ return Qnil;
480
+ }
481
+
482
+ state->stats.sampled++;
483
+
484
+ cpu_and_wall_time_collector_sample(state->cpu_and_wall_time_collector_instance, wall_time_ns_before_sample);
485
+
486
+ long wall_time_ns_after_sample = monotonic_wall_time_now_ns(RAISE_ON_FAILURE);
487
+ long delta_ns = wall_time_ns_after_sample - wall_time_ns_before_sample;
488
+
489
+ // Guard against wall-time going backwards, see https://github.com/DataDog/dd-trace-rb/pull/2336 for discussion.
490
+ uint64_t sampling_time_ns = delta_ns < 0 ? 0 : delta_ns;
491
+
492
+ state->stats.sampling_time_ns_min = uint64_min_of(sampling_time_ns, state->stats.sampling_time_ns_min);
493
+ state->stats.sampling_time_ns_max = uint64_max_of(sampling_time_ns, state->stats.sampling_time_ns_max);
494
+ state->stats.sampling_time_ns_total += sampling_time_ns;
495
+
496
+ dynamic_sampling_rate_after_sample(&state->dynamic_sampling_rate, wall_time_ns_after_sample, sampling_time_ns);
497
+
498
+ // Return a dummy VALUE because we're called from rb_rescue2 which requires it
499
+ return Qnil;
366
500
  }
367
501
 
368
- static VALUE handle_sampling_failure(VALUE self_instance, VALUE exception) { return stop(self_instance, exception); }
502
+ static VALUE handle_sampling_failure(VALUE self_instance, VALUE exception) {
503
+ stop(self_instance, exception);
504
+ return Qnil;
505
+ }
369
506
 
370
507
  // This method exists only to enable testing Datadog::Profiling::Collectors::CpuAndWallTimeWorker behavior using RSpec.
371
508
  // It SHOULD NOT be used for other purposes.
@@ -390,6 +527,11 @@ static VALUE release_gvl_and_run_sampling_trigger_loop(VALUE instance) {
390
527
  struct cpu_and_wall_time_worker_state *state;
391
528
  TypedData_Get_Struct(instance, struct cpu_and_wall_time_worker_state, &cpu_and_wall_time_worker_typed_data, state);
392
529
 
530
+ // Final preparations: Setup signal handler and enable tracepoint. We run these here and not in `_native_sampling_loop`
531
+ // because they may raise exceptions.
532
+ install_sigprof_signal_handler(handle_sampling_signal, "handle_sampling_signal");
533
+ if (state->gc_profiling_enabled) rb_tracepoint_enable(state->gc_tracepoint);
534
+
393
535
  rb_thread_call_without_gvl(run_sampling_trigger_loop, state, interrupt_sampling_trigger_loop, state);
394
536
 
395
537
  // If we stopped sampling due to an exception, re-raise it (now in the worker thread)
@@ -401,9 +543,9 @@ static VALUE release_gvl_and_run_sampling_trigger_loop(VALUE instance) {
401
543
  // This method exists only to enable testing Datadog::Profiling::Collectors::CpuAndWallTimeWorker behavior using RSpec.
402
544
  // It SHOULD NOT be used for other purposes.
403
545
  static VALUE _native_is_running(DDTRACE_UNUSED VALUE self, VALUE instance) {
404
- return \
405
- (active_sampler_owner_thread != Qnil && is_thread_alive(active_sampler_owner_thread) && active_sampler_instance == instance) ?
406
- Qtrue : Qfalse;
546
+ struct cpu_and_wall_time_worker_state *state = active_sampler_instance_state; // Read from global variable, see "sampler global state safety" note above
547
+
548
+ return (state != NULL && is_thread_alive(state->owner_thread) && state->self_instance == instance) ? Qtrue : Qfalse;
407
549
  }
408
550
 
409
551
  static void testing_signal_handler(DDTRACE_UNUSED int _signal, DDTRACE_UNUSED siginfo_t *_info, DDTRACE_UNUSED void *_ucontext) {
@@ -461,16 +603,11 @@ static void on_gc_event(VALUE tracepoint_data, DDTRACE_UNUSED void *unused) {
461
603
  int event = rb_tracearg_event_flag(rb_tracearg_from_tracepoint(tracepoint_data));
462
604
  if (event != RUBY_INTERNAL_EVENT_GC_ENTER && event != RUBY_INTERNAL_EVENT_GC_EXIT) return; // Unknown event
463
605
 
464
- VALUE instance = active_sampler_instance; // Read from global variable
606
+ struct cpu_and_wall_time_worker_state *state = active_sampler_instance_state; // Read from global variable, see "sampler global state safety" note above
465
607
 
466
608
  // This should not happen in a normal situation because the tracepoint is always enabled after the instance is set
467
609
  // and disabled before it is cleared, but just in case...
468
- if (instance == Qnil) return;
469
-
470
- struct cpu_and_wall_time_worker_state *state;
471
- if (!rb_typeddata_is_kind_of(instance, &cpu_and_wall_time_worker_typed_data)) return;
472
- // This should never fail if the above check passes
473
- TypedData_Get_Struct(instance, struct cpu_and_wall_time_worker_state, &cpu_and_wall_time_worker_typed_data, state);
610
+ if (state == NULL) return;
474
611
 
475
612
  if (event == RUBY_INTERNAL_EVENT_GC_ENTER) {
476
613
  cpu_and_wall_time_collector_on_gc_start(state->cpu_and_wall_time_collector_instance);
@@ -493,15 +630,18 @@ static void on_gc_event(VALUE tracepoint_data, DDTRACE_UNUSED void *unused) {
493
630
  cpu_and_wall_time_collector_on_gc_finish(state->cpu_and_wall_time_collector_instance);
494
631
  // We use rb_postponed_job_register_one to ask Ruby to run cpu_and_wall_time_collector_sample_after_gc after it
495
632
  // fully finishes the garbage collection, so that one is allowed to do allocations and throw exceptions as usual.
633
+ //
634
+ // Note: If we ever want to get rid of rb_postponed_job_register_one, remember not to clobber Ruby exceptions, as
635
+ // this function does this helpful job for us now -- https://github.com/ruby/ruby/commit/a98e343d39c4d7bf1e2190b076720f32d9f298b3.
496
636
  rb_postponed_job_register_one(0, after_gc_from_postponed_job, NULL);
497
637
  }
498
638
  }
499
639
 
500
640
  static void after_gc_from_postponed_job(DDTRACE_UNUSED void *_unused) {
501
- VALUE instance = active_sampler_instance; // Read from global variable
641
+ struct cpu_and_wall_time_worker_state *state = active_sampler_instance_state; // Read from global variable, see "sampler global state safety" note above
502
642
 
503
643
  // This can potentially happen if the CpuAndWallTimeWorker was stopped while the postponed job was waiting to be executed; nothing to do
504
- if (instance == Qnil) return;
644
+ if (state == NULL) return;
505
645
 
506
646
  // @ivoanjo: I'm not sure this can ever happen because `on_gc_event` only enqueues this callback if
507
647
  // it's running on the main Ractor, but just in case...
@@ -509,18 +649,15 @@ static void after_gc_from_postponed_job(DDTRACE_UNUSED void *_unused) {
509
649
  return; // We're not on the main Ractor; we currently don't support profiling non-main Ractors
510
650
  }
511
651
 
512
- struct cpu_and_wall_time_worker_state *state;
513
- TypedData_Get_Struct(instance, struct cpu_and_wall_time_worker_state, &cpu_and_wall_time_worker_typed_data, state);
514
-
515
652
  // Trigger sampling using the Collectors::CpuAndWallTime; rescue against any exceptions that happen during sampling
516
- safely_call(cpu_and_wall_time_collector_sample_after_gc, state->cpu_and_wall_time_collector_instance, instance);
653
+ safely_call(cpu_and_wall_time_collector_sample_after_gc, state->cpu_and_wall_time_collector_instance, state->self_instance);
517
654
  }
518
655
 
519
656
  // Equivalent to Ruby begin/rescue call, where we call a C function and jump to the exception handler if an
520
657
  // exception gets raised within
521
- static void safely_call(VALUE (*function_to_call_safely)(VALUE), VALUE function_to_call_safely_arg, VALUE instance) {
658
+ static VALUE safely_call(VALUE (*function_to_call_safely)(VALUE), VALUE function_to_call_safely_arg, VALUE instance) {
522
659
  VALUE exception_handler_function_arg = instance;
523
- rb_rescue2(
660
+ return rb_rescue2(
524
661
  function_to_call_safely,
525
662
  function_to_call_safely_arg,
526
663
  handle_sampling_failure,
@@ -559,8 +696,81 @@ static VALUE _native_reset_after_fork(DDTRACE_UNUSED VALUE self, VALUE instance)
559
696
  // Disable all tracepoints, so that there are no more attempts to mutate the profile
560
697
  rb_tracepoint_disable(state->gc_tracepoint);
561
698
 
699
+ reset_stats(state);
700
+
562
701
  // Remove all state from the `Collectors::CpuAndWallTime` and connected downstream components
563
702
  rb_funcall(state->cpu_and_wall_time_collector_instance, rb_intern("reset_after_fork"), 0);
564
703
 
565
704
  return Qtrue;
566
705
  }
706
+
707
+ static VALUE _native_is_sigprof_blocked_in_current_thread(DDTRACE_UNUSED VALUE self) {
708
+ return is_sigprof_blocked_in_current_thread();
709
+ }
710
+
711
+ static VALUE _native_stats(DDTRACE_UNUSED VALUE self, VALUE instance) {
712
+ struct cpu_and_wall_time_worker_state *state;
713
+ TypedData_Get_Struct(instance, struct cpu_and_wall_time_worker_state, &cpu_and_wall_time_worker_typed_data, state);
714
+
715
+ VALUE pretty_sampling_time_ns_min = state->stats.sampling_time_ns_min == UINT64_MAX ? Qnil : ULL2NUM(state->stats.sampling_time_ns_min);
716
+ VALUE pretty_sampling_time_ns_max = state->stats.sampling_time_ns_max == 0 ? Qnil : ULL2NUM(state->stats.sampling_time_ns_max);
717
+ VALUE pretty_sampling_time_ns_total = state->stats.sampling_time_ns_total == 0 ? Qnil : ULL2NUM(state->stats.sampling_time_ns_total);
718
+ VALUE pretty_sampling_time_ns_avg =
719
+ state->stats.sampled == 0 ? Qnil : DBL2NUM(((double) state->stats.sampling_time_ns_total) / state->stats.sampled);
720
+
721
+ VALUE stats_as_hash = rb_hash_new();
722
+ VALUE arguments[] = {
723
+ ID2SYM(rb_intern("trigger_sample_attempts")), /* => */ UINT2NUM(state->stats.trigger_sample_attempts),
724
+ ID2SYM(rb_intern("trigger_simulated_signal_delivery_attempts")), /* => */ UINT2NUM(state->stats.trigger_simulated_signal_delivery_attempts),
725
+ ID2SYM(rb_intern("simulated_signal_delivery")), /* => */ UINT2NUM(state->stats.simulated_signal_delivery),
726
+ ID2SYM(rb_intern("signal_handler_enqueued_sample")), /* => */ UINT2NUM(state->stats.signal_handler_enqueued_sample),
727
+ ID2SYM(rb_intern("signal_handler_wrong_thread")), /* => */ UINT2NUM(state->stats.signal_handler_wrong_thread),
728
+ ID2SYM(rb_intern("sampled")), /* => */ UINT2NUM(state->stats.sampled),
729
+ ID2SYM(rb_intern("sampling_time_ns_min")), /* => */ pretty_sampling_time_ns_min,
730
+ ID2SYM(rb_intern("sampling_time_ns_max")), /* => */ pretty_sampling_time_ns_max,
731
+ ID2SYM(rb_intern("sampling_time_ns_total")), /* => */ pretty_sampling_time_ns_total,
732
+ ID2SYM(rb_intern("sampling_time_ns_avg")), /* => */ pretty_sampling_time_ns_avg,
733
+ };
734
+ for (long unsigned int i = 0; i < VALUE_COUNT(arguments); i += 2) rb_hash_aset(stats_as_hash, arguments[i], arguments[i+1]);
735
+ return stats_as_hash;
736
+ }
737
+
738
+ void *simulate_sampling_signal_delivery(DDTRACE_UNUSED void *_unused) {
739
+ struct cpu_and_wall_time_worker_state *state = active_sampler_instance_state; // Read from global variable, see "sampler global state safety" note above
740
+
741
+ // This can potentially happen if the CpuAndWallTimeWorker was stopped while the IdleSamplingHelper was trying to execute this action
742
+ if (state == NULL) return NULL;
743
+
744
+ state->stats.simulated_signal_delivery++;
745
+
746
+ // @ivoanjo: We could instead directly call sample_from_postponed_job, but I chose to go through the signal handler
747
+ // so that the simulated case is as close to the original one as possible (including any metrics increases, etc).
748
+ handle_sampling_signal(0, NULL, NULL);
749
+
750
+ return NULL; // Unused
751
+ }
752
+
753
+ static void grab_gvl_and_sample(void) { rb_thread_call_with_gvl(simulate_sampling_signal_delivery, NULL); }
754
+
755
+ static void reset_stats(struct cpu_and_wall_time_worker_state *state) {
756
+ state->stats = (struct stats) {}; // Resets all stats back to zero
757
+ state->stats.sampling_time_ns_min = UINT64_MAX; // Since we always take the min between existing and latest sample
758
+ }
759
+
760
+ static void sleep_for(uint64_t time_ns) {
761
+ // As a simplification, we currently only support setting .tv_nsec
762
+ if (time_ns >= SECONDS_AS_NS(1)) {
763
+ grab_gvl_and_raise(rb_eArgError, "sleep_for can only sleep for less than 1 second, time_ns: %"PRIu64, time_ns);
764
+ }
765
+
766
+ struct timespec time_to_sleep = {.tv_nsec = time_ns};
767
+
768
+ while (nanosleep(&time_to_sleep, &time_to_sleep) != 0) {
769
+ if (errno == EINTR) {
770
+ // We were interrupted. nanosleep updates "time_to_sleep" to contain only the remaining time, so we just let the
771
+ // loop keep going.
772
+ } else {
773
+ ENFORCE_SUCCESS_NO_GVL(errno);
774
+ }
775
+ }
776
+ }