ddtrace 1.6.1 → 1.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (171)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +89 -2
  3. data/README.md +2 -2
  4. data/ext/ddtrace_profiling_loader/extconf.rb +5 -2
  5. data/ext/ddtrace_profiling_native_extension/NativeExtensionDesign.md +1 -1
  6. data/ext/ddtrace_profiling_native_extension/clock_id_from_pthread.c +3 -2
  7. data/ext/ddtrace_profiling_native_extension/collectors_cpu_and_wall_time.c +81 -47
  8. data/ext/ddtrace_profiling_native_extension/collectors_cpu_and_wall_time.h +1 -1
  9. data/ext/ddtrace_profiling_native_extension/collectors_cpu_and_wall_time_worker.c +332 -125
  10. data/ext/ddtrace_profiling_native_extension/collectors_dynamic_sampling_rate.c +142 -0
  11. data/ext/ddtrace_profiling_native_extension/collectors_dynamic_sampling_rate.h +14 -0
  12. data/ext/ddtrace_profiling_native_extension/collectors_idle_sampling_helper.c +241 -0
  13. data/ext/ddtrace_profiling_native_extension/collectors_idle_sampling_helper.h +3 -0
  14. data/ext/ddtrace_profiling_native_extension/collectors_stack.c +11 -13
  15. data/ext/ddtrace_profiling_native_extension/extconf.rb +22 -8
  16. data/ext/ddtrace_profiling_native_extension/helpers.h +5 -0
  17. data/ext/ddtrace_profiling_native_extension/native_extension_helpers.rb +8 -0
  18. data/ext/ddtrace_profiling_native_extension/private_vm_api_access.c +111 -26
  19. data/ext/ddtrace_profiling_native_extension/private_vm_api_access.h +9 -0
  20. data/ext/ddtrace_profiling_native_extension/profiling.c +205 -0
  21. data/ext/ddtrace_profiling_native_extension/ruby_helpers.c +86 -0
  22. data/ext/ddtrace_profiling_native_extension/ruby_helpers.h +28 -6
  23. data/ext/ddtrace_profiling_native_extension/setup_signal_handler.c +115 -0
  24. data/ext/ddtrace_profiling_native_extension/setup_signal_handler.h +11 -0
  25. data/ext/ddtrace_profiling_native_extension/stack_recorder.c +84 -35
  26. data/ext/ddtrace_profiling_native_extension/stack_recorder.h +1 -0
  27. data/ext/ddtrace_profiling_native_extension/time_helpers.c +17 -0
  28. data/ext/ddtrace_profiling_native_extension/time_helpers.h +10 -0
  29. data/lib/datadog/appsec/assets/blocked.html +98 -3
  30. data/lib/datadog/appsec/assets/blocked.json +1 -0
  31. data/lib/datadog/appsec/assets/blocked.text +5 -0
  32. data/lib/datadog/appsec/assets/waf_rules/recommended.json +35 -46
  33. data/lib/datadog/appsec/assets/waf_rules/risky.json +1 -1
  34. data/lib/datadog/appsec/assets/waf_rules/strict.json +46 -1
  35. data/lib/datadog/appsec/assets.rb +2 -2
  36. data/lib/datadog/appsec/configuration/settings.rb +6 -0
  37. data/lib/datadog/appsec/configuration.rb +4 -0
  38. data/lib/datadog/appsec/contrib/rack/reactive/request.rb +4 -8
  39. data/lib/datadog/appsec/contrib/rack/request.rb +17 -0
  40. data/lib/datadog/appsec/contrib/rack/request_body_middleware.rb +2 -2
  41. data/lib/datadog/appsec/contrib/rack/request_middleware.rb +2 -2
  42. data/lib/datadog/appsec/contrib/rails/patcher.rb +3 -6
  43. data/lib/datadog/appsec/contrib/sinatra/ext.rb +1 -0
  44. data/lib/datadog/appsec/contrib/sinatra/gateway/watcher.rb +1 -1
  45. data/lib/datadog/appsec/contrib/sinatra/patcher.rb +11 -8
  46. data/lib/datadog/appsec/extensions.rb +10 -0
  47. data/lib/datadog/appsec/processor.rb +18 -0
  48. data/lib/datadog/appsec/response.rb +54 -0
  49. data/lib/datadog/core/configuration/components.rb +27 -6
  50. data/lib/datadog/core/configuration/ext.rb +18 -0
  51. data/lib/datadog/core/configuration/settings.rb +14 -341
  52. data/lib/datadog/core/diagnostics/health.rb +4 -22
  53. data/lib/datadog/core/environment/variable_helpers.rb +58 -10
  54. data/lib/datadog/core/runtime/ext.rb +1 -1
  55. data/lib/datadog/core/utils.rb +0 -21
  56. data/lib/datadog/core.rb +21 -1
  57. data/lib/datadog/opentracer/distributed_headers.rb +7 -9
  58. data/lib/datadog/opentracer/rack_propagator.rb +0 -3
  59. data/lib/datadog/opentracer/text_map_propagator.rb +5 -7
  60. data/lib/datadog/profiling/collectors/cpu_and_wall_time.rb +10 -4
  61. data/lib/datadog/profiling/collectors/cpu_and_wall_time_worker.rb +20 -5
  62. data/lib/datadog/profiling/collectors/dynamic_sampling_rate.rb +14 -0
  63. data/lib/datadog/profiling/collectors/idle_sampling_helper.rb +68 -0
  64. data/lib/datadog/profiling/collectors/old_stack.rb +7 -0
  65. data/lib/datadog/profiling/exporter.rb +5 -0
  66. data/lib/datadog/profiling/old_recorder.rb +8 -0
  67. data/lib/datadog/profiling/profiler.rb +7 -0
  68. data/lib/datadog/profiling/scheduler.rb +4 -7
  69. data/lib/datadog/profiling/stack_recorder.rb +36 -0
  70. data/lib/datadog/profiling/tasks/setup.rb +0 -7
  71. data/lib/datadog/profiling.rb +2 -0
  72. data/lib/datadog/tracing/configuration/ext.rb +33 -3
  73. data/lib/datadog/tracing/configuration/settings.rb +433 -0
  74. data/lib/datadog/tracing/contrib/aws/configuration/settings.rb +4 -1
  75. data/lib/datadog/tracing/contrib/aws/ext.rb +1 -0
  76. data/lib/datadog/tracing/contrib/dalli/configuration/settings.rb +4 -1
  77. data/lib/datadog/tracing/contrib/dalli/ext.rb +1 -0
  78. data/lib/datadog/tracing/contrib/delayed_job/plugin.rb +4 -0
  79. data/lib/datadog/tracing/contrib/elasticsearch/configuration/settings.rb +5 -1
  80. data/lib/datadog/tracing/contrib/elasticsearch/ext.rb +1 -0
  81. data/lib/datadog/tracing/contrib/ethon/configuration/settings.rb +6 -1
  82. data/lib/datadog/tracing/contrib/ethon/ext.rb +1 -0
  83. data/lib/datadog/tracing/contrib/excon/configuration/settings.rb +5 -1
  84. data/lib/datadog/tracing/contrib/excon/ext.rb +1 -0
  85. data/lib/datadog/tracing/contrib/faraday/configuration/settings.rb +5 -1
  86. data/lib/datadog/tracing/contrib/faraday/ext.rb +1 -0
  87. data/lib/datadog/tracing/contrib/grpc/configuration/settings.rb +6 -1
  88. data/lib/datadog/tracing/contrib/grpc/datadog_interceptor/client.rb +2 -1
  89. data/lib/datadog/tracing/contrib/grpc/datadog_interceptor/server.rb +6 -12
  90. data/lib/datadog/tracing/contrib/grpc/distributed/fetcher.rb +27 -0
  91. data/lib/datadog/tracing/contrib/grpc/distributed/propagation.rb +43 -0
  92. data/lib/datadog/tracing/contrib/grpc/ext.rb +1 -0
  93. data/lib/datadog/tracing/contrib/grpc/patcher.rb +0 -2
  94. data/lib/datadog/tracing/contrib/http/configuration/settings.rb +6 -1
  95. data/lib/datadog/tracing/contrib/http/distributed/fetcher.rb +32 -0
  96. data/lib/datadog/tracing/contrib/http/distributed/propagation.rb +38 -0
  97. data/lib/datadog/tracing/contrib/http/ext.rb +1 -0
  98. data/lib/datadog/tracing/contrib/httpclient/configuration/settings.rb +6 -1
  99. data/lib/datadog/tracing/contrib/httpclient/ext.rb +1 -0
  100. data/lib/datadog/tracing/contrib/httprb/configuration/settings.rb +6 -1
  101. data/lib/datadog/tracing/contrib/httprb/ext.rb +1 -0
  102. data/lib/datadog/tracing/contrib/kafka/consumer_event.rb +1 -0
  103. data/lib/datadog/tracing/contrib/kafka/events/produce_operation/send_messages.rb +1 -0
  104. data/lib/datadog/tracing/contrib/kafka/events/producer/deliver_messages.rb +1 -0
  105. data/lib/datadog/tracing/contrib/mongodb/configuration/settings.rb +5 -1
  106. data/lib/datadog/tracing/contrib/mongodb/ext.rb +1 -0
  107. data/lib/datadog/tracing/contrib/mongodb/subscribers.rb +2 -0
  108. data/lib/datadog/tracing/contrib/mysql2/configuration/settings.rb +4 -1
  109. data/lib/datadog/tracing/contrib/mysql2/ext.rb +1 -0
  110. data/lib/datadog/tracing/contrib/mysql2/instrumentation.rb +2 -2
  111. data/lib/datadog/tracing/contrib/patcher.rb +3 -2
  112. data/lib/datadog/tracing/contrib/pg/configuration/settings.rb +4 -1
  113. data/lib/datadog/tracing/contrib/pg/ext.rb +1 -0
  114. data/lib/datadog/tracing/contrib/pg/instrumentation.rb +12 -2
  115. data/lib/datadog/tracing/contrib/presto/configuration/settings.rb +4 -1
  116. data/lib/datadog/tracing/contrib/presto/ext.rb +1 -0
  117. data/lib/datadog/tracing/contrib/propagation/sql_comment/ext.rb +1 -0
  118. data/lib/datadog/tracing/contrib/propagation/sql_comment.rb +10 -12
  119. data/lib/datadog/tracing/contrib/que/tracer.rb +2 -0
  120. data/lib/datadog/tracing/contrib/racecar/events/batch.rb +4 -1
  121. data/lib/datadog/tracing/contrib/racecar/events/message.rb +4 -1
  122. data/lib/datadog/tracing/contrib/rack/middlewares.rb +2 -0
  123. data/lib/datadog/tracing/contrib/redis/configuration/settings.rb +4 -1
  124. data/lib/datadog/tracing/contrib/redis/ext.rb +1 -0
  125. data/lib/datadog/tracing/contrib/redis/instrumentation.rb +30 -21
  126. data/lib/datadog/tracing/contrib/redis/integration.rb +34 -2
  127. data/lib/datadog/tracing/contrib/redis/patcher.rb +18 -14
  128. data/lib/datadog/tracing/contrib/redis/quantize.rb +12 -9
  129. data/lib/datadog/tracing/contrib/redis/tags.rb +4 -6
  130. data/lib/datadog/tracing/contrib/redis/trace_middleware.rb +72 -0
  131. data/lib/datadog/tracing/contrib/resque/resque_job.rb +2 -0
  132. data/lib/datadog/tracing/contrib/rest_client/configuration/settings.rb +6 -1
  133. data/lib/datadog/tracing/contrib/rest_client/ext.rb +1 -0
  134. data/lib/datadog/tracing/contrib/shoryuken/tracer.rb +2 -0
  135. data/lib/datadog/tracing/contrib/sidekiq/client_tracer.rb +5 -0
  136. data/lib/datadog/tracing/contrib/sidekiq/server_tracer.rb +5 -0
  137. data/lib/datadog/tracing/contrib/sneakers/tracer.rb +2 -0
  138. data/lib/datadog/{core → tracing}/diagnostics/ext.rb +1 -6
  139. data/lib/datadog/tracing/diagnostics/health.rb +40 -0
  140. data/lib/datadog/tracing/distributed/b3_multi.rb +66 -0
  141. data/lib/datadog/tracing/distributed/b3_single.rb +66 -0
  142. data/lib/datadog/tracing/distributed/datadog.rb +153 -0
  143. data/lib/datadog/tracing/distributed/datadog_tags_codec.rb +1 -0
  144. data/lib/datadog/tracing/distributed/fetcher.rb +30 -0
  145. data/lib/datadog/tracing/distributed/headers/ext.rb +18 -16
  146. data/lib/datadog/tracing/distributed/helpers.rb +9 -7
  147. data/lib/datadog/tracing/distributed/none.rb +19 -0
  148. data/lib/datadog/tracing/distributed/propagation.rb +127 -0
  149. data/lib/datadog/tracing/distributed/trace_context.rb +369 -0
  150. data/lib/datadog/tracing/metadata/ext.rb +1 -1
  151. data/lib/datadog/tracing/propagation/http.rb +3 -106
  152. data/lib/datadog/tracing/sampling/priority_sampler.rb +11 -0
  153. data/lib/datadog/tracing/sampling/rate_sampler.rb +3 -3
  154. data/lib/datadog/tracing/span.rb +3 -19
  155. data/lib/datadog/tracing/span_operation.rb +5 -4
  156. data/lib/datadog/tracing/trace_digest.rb +75 -2
  157. data/lib/datadog/tracing/trace_operation.rb +5 -4
  158. data/lib/datadog/tracing/trace_segment.rb +1 -1
  159. data/lib/datadog/tracing/utils.rb +50 -0
  160. data/lib/ddtrace/transport/trace_formatter.rb +2 -5
  161. data/lib/ddtrace/version.rb +2 -2
  162. metadata +35 -15
  163. data/lib/datadog/tracing/distributed/headers/b3.rb +0 -55
  164. data/lib/datadog/tracing/distributed/headers/b3_single.rb +0 -67
  165. data/lib/datadog/tracing/distributed/headers/datadog.rb +0 -144
  166. data/lib/datadog/tracing/distributed/headers/parser.rb +0 -37
  167. data/lib/datadog/tracing/distributed/metadata/b3.rb +0 -55
  168. data/lib/datadog/tracing/distributed/metadata/b3_single.rb +0 -66
  169. data/lib/datadog/tracing/distributed/metadata/datadog.rb +0 -73
  170. data/lib/datadog/tracing/distributed/metadata/parser.rb +0 -34
  171. data/lib/datadog/tracing/propagation/grpc.rb +0 -98
@@ -3,11 +3,18 @@
  #include <ruby/thread_native.h>
  #include <ruby/debug.h>
  #include <stdbool.h>
+ #include <stdatomic.h>
  #include <signal.h>
+ #include <errno.h>
+
  #include "helpers.h"
  #include "ruby_helpers.h"
  #include "collectors_cpu_and_wall_time.h"
+ #include "collectors_dynamic_sampling_rate.h"
+ #include "collectors_idle_sampling_helper.h"
  #include "private_vm_api_access.h"
+ #include "setup_signal_handler.h"
+ #include "time_helpers.h"

  // Used to trigger the periodic execution of Collectors::CpuAndWallTime, which implements all of the sampling logic
  // itself; this class only implements the "doing it periodically" part.
@@ -29,7 +36,7 @@
  // internals, we may be able to figure out a way of overcoming it. But it's definitely going to be hard so for now
  // we're considering it as a given.
  //
- // ### Flow for triggering samples
+ // ### Flow for triggering CPU/Wall-time samples
  //
  // The flow for triggering samples is as follows:
  //
@@ -56,23 +63,56 @@
  // 4. The Ruby VM calls our `sample_from_postponed_job` from a thread holding the global VM lock. A sample is recorded by
  // calling `cpu_and_wall_time_collector_sample`.
  //
+ // ### TracePoints and Forking
+ //
+ // When the Ruby VM forks, the CPU/Wall-time profiling stops naturally because it's triggered by a background thread
+ // that doesn't get automatically restarted by the VM on the child process. (The profiler does trigger its restart at
+ // some point -- see `Profiling::Tasks::Setup` for details).
+ //
+ // But this doesn't apply to any `TracePoint`s this class may use, which will continue to be active. Thus, we need to
+ // always remember to consider this case -- the worker thread may not be alive but the `TracePoint`s can continue to
+ // trigger samples.
+ //
  // ---

  // Contains state for a single CpuAndWallTimeWorker instance
  struct cpu_and_wall_time_worker_state {
-   // Important: This is not atomic nor is it guaranteed to replace memory barriers and the like. Aka this works for
-   // telling the sampling trigger loop to stop, but if we ever need to communicate more, we should move to actual
-   // atomic operations. stdatomic.h seems a nice thing to reach out for.
-   volatile bool should_run;
+   atomic_bool should_run;
+
    bool gc_profiling_enabled;
+   VALUE self_instance;
    VALUE cpu_and_wall_time_collector_instance;
+   VALUE idle_sampling_helper_instance;
+   VALUE owner_thread;
+   dynamic_sampling_rate_state dynamic_sampling_rate;

    // When something goes wrong during sampling, we record the Ruby exception here, so that it can be "re-raised" on
    // the CpuAndWallTimeWorker thread
    VALUE failure_exception;
+   // Used by `_native_stop` to flag the worker thread to stop (see comment on `_native_sampling_loop`)
+   VALUE stop_thread;

    // Used to get gc start/finish information
    VALUE gc_tracepoint;
+
+   struct stats {
+     // How many times we tried to trigger a sample
+     unsigned int trigger_sample_attempts;
+     // How many times we tried to simulate signal delivery
+     unsigned int trigger_simulated_signal_delivery_attempts;
+     // How many times we actually simulated signal delivery
+     unsigned int simulated_signal_delivery;
+     // How many times we actually called rb_postponed_job_register_one from a signal handler
+     unsigned int signal_handler_enqueued_sample;
+     // How many times the signal handler was called from the wrong thread
+     unsigned int signal_handler_wrong_thread;
+     // How many times we actually sampled (except GC samples)
+     unsigned int sampled;
+     // Min/max/total wall-time spent sampling (except GC samples)
+     uint64_t sampling_time_ns_min;
+     uint64_t sampling_time_ns_max;
+     uint64_t sampling_time_ns_total;
+   } stats;
  };

  static VALUE _native_new(VALUE klass);
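
The struct above replaces the old `volatile bool should_run` with C11's `atomic_bool`, exactly as the deleted comment suggested ("stdatomic.h seems a nice thing to reach out for"). For readers unfamiliar with the pattern, a minimal standalone sketch of an atomic stop flag shared between a controller and a worker thread follows; it is illustrative only, not code from the gem:

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <pthread.h>
    #include <unistd.h>
    #include <stdio.h>

    static atomic_bool should_run;

    // Worker loop: atomic_load guarantees the store from the other thread becomes
    // visible, unlike `volatile`, which makes no cross-thread visibility promises.
    static void *worker(void *unused) {
      (void) unused;
      while (atomic_load(&should_run)) {
        // ... trigger a sample, etc ...
        usleep(10 * 1000); // 10ms, mirroring the sampling loop's minimum cadence
      }
      return NULL;
    }

    int main(void) {
      atomic_init(&should_run, true);

      pthread_t thread;
      pthread_create(&thread, NULL, worker, NULL);

      sleep(1);
      atomic_store(&should_run, false); // Flag the worker to stop
      pthread_join(thread, NULL);

      printf("worker stopped cleanly\n");
      return 0;
    }
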
@@ -80,19 +120,18 @@ static VALUE _native_initialize(
    DDTRACE_UNUSED VALUE _self,
    VALUE self_instance,
    VALUE cpu_and_wall_time_collector_instance,
-   VALUE gc_profiling_enabled
+   VALUE gc_profiling_enabled,
+   VALUE idle_sampling_helper_instance
  );
  static void cpu_and_wall_time_worker_typed_data_mark(void *state_ptr);
  static VALUE _native_sampling_loop(VALUE self, VALUE instance);
- static VALUE _native_stop(DDTRACE_UNUSED VALUE _self, VALUE self_instance);
+ static VALUE _native_stop(DDTRACE_UNUSED VALUE _self, VALUE self_instance, VALUE worker_thread);
  static VALUE stop(VALUE self_instance, VALUE optional_exception);
- static void install_sigprof_signal_handler(void (*signal_handler_function)(int, siginfo_t *, void *));
- static void remove_sigprof_signal_handler(void);
- static void block_sigprof_signal_handler_from_running_in_current_thread(void);
  static void handle_sampling_signal(DDTRACE_UNUSED int _signal, DDTRACE_UNUSED siginfo_t *_info, DDTRACE_UNUSED void *_ucontext);
  static void *run_sampling_trigger_loop(void *state_ptr);
  static void interrupt_sampling_trigger_loop(void *state_ptr);
  static void sample_from_postponed_job(DDTRACE_UNUSED void *_unused);
+ static VALUE rescued_sample_from_postponed_job(VALUE self_instance);
  static VALUE handle_sampling_failure(VALUE self_instance, VALUE exception);
  static VALUE _native_current_sigprof_signal_handler(DDTRACE_UNUSED VALUE self);
  static VALUE release_gvl_and_run_sampling_trigger_loop(VALUE instance);
@@ -104,22 +143,30 @@ static VALUE _native_trigger_sample(DDTRACE_UNUSED VALUE self);
  static VALUE _native_gc_tracepoint(DDTRACE_UNUSED VALUE self, VALUE instance);
  static void on_gc_event(VALUE tracepoint_data, DDTRACE_UNUSED void *unused);
  static void after_gc_from_postponed_job(DDTRACE_UNUSED void *_unused);
- static void safely_call(VALUE (*function_to_call_safely)(VALUE), VALUE function_to_call_safely_arg, VALUE instance);
+ static VALUE safely_call(VALUE (*function_to_call_safely)(VALUE), VALUE function_to_call_safely_arg, VALUE instance);
  static VALUE _native_simulate_handle_sampling_signal(DDTRACE_UNUSED VALUE self);
  static VALUE _native_simulate_sample_from_postponed_job(DDTRACE_UNUSED VALUE self);
-
- // Global state -- be very careful when accessing or modifying it
-
- // Note: Global state must only be mutated while holding the global VM lock (we piggy back on it to ensure correctness).
- // The active_sampler_instance needs to be global because we access it from the signal handler.
+ static VALUE _native_reset_after_fork(DDTRACE_UNUSED VALUE self, VALUE instance);
+ static VALUE _native_is_sigprof_blocked_in_current_thread(DDTRACE_UNUSED VALUE self);
+ static VALUE _native_stats(DDTRACE_UNUSED VALUE self, VALUE instance);
+ void *simulate_sampling_signal_delivery(DDTRACE_UNUSED void *_unused);
+ static void grab_gvl_and_sample(void);
+ static void reset_stats(struct cpu_and_wall_time_worker_state *state);
+ static void sleep_for(uint64_t time_ns);
+
+ // Note on sampler global state safety:
+ //
+ // Both `active_sampler_instance` and `active_sampler_instance_state` are **GLOBAL** state. Be careful when accessing
+ // or modifying them.
+ // In particular, it's important to only mutate them while holding the global VM lock, to ensure correctness.
+ //
+ // This global state is needed because a bunch of functions in this file need to access it from situations
+ // (e.g. signal handler) where it's impossible or just awkward to pass it as an argument.
  static VALUE active_sampler_instance = Qnil;
- // ...We also store active_sampler_owner_thread to be able to tell who the active_sampler_instance belongs to (and also
- // to detect when it is outdated)
- static VALUE active_sampler_owner_thread = Qnil;
+ struct cpu_and_wall_time_worker_state *active_sampler_instance_state = NULL;

  void collectors_cpu_and_wall_time_worker_init(VALUE profiling_module) {
    rb_global_variable(&active_sampler_instance);
-   rb_global_variable(&active_sampler_owner_thread);

    VALUE collectors_module = rb_define_module_under(profiling_module, "Collectors");
    VALUE collectors_cpu_and_wall_time_worker_class = rb_define_class_under(collectors_module, "CpuAndWallTimeWorker", rb_cObject);
@@ -136,9 +183,11 @@ void collectors_cpu_and_wall_time_worker_init(VALUE profiling_module) {
    // https://bugs.ruby-lang.org/issues/18007 for a discussion around this.
    rb_define_alloc_func(collectors_cpu_and_wall_time_worker_class, _native_new);

-   rb_define_singleton_method(collectors_cpu_and_wall_time_worker_class, "_native_initialize", _native_initialize, 3);
+   rb_define_singleton_method(collectors_cpu_and_wall_time_worker_class, "_native_initialize", _native_initialize, 4);
    rb_define_singleton_method(collectors_cpu_and_wall_time_worker_class, "_native_sampling_loop", _native_sampling_loop, 1);
-   rb_define_singleton_method(collectors_cpu_and_wall_time_worker_class, "_native_stop", _native_stop, 1);
+   rb_define_singleton_method(collectors_cpu_and_wall_time_worker_class, "_native_stop", _native_stop, 2);
+   rb_define_singleton_method(collectors_cpu_and_wall_time_worker_class, "_native_reset_after_fork", _native_reset_after_fork, 1);
+   rb_define_singleton_method(collectors_cpu_and_wall_time_worker_class, "_native_stats", _native_stats, 1);
    rb_define_singleton_method(testing_module, "_native_current_sigprof_signal_handler", _native_current_sigprof_signal_handler, 0);
    rb_define_singleton_method(testing_module, "_native_is_running?", _native_is_running, 1);
    rb_define_singleton_method(testing_module, "_native_install_testing_signal_handler", _native_install_testing_signal_handler, 0);
@@ -147,6 +196,7 @@ void collectors_cpu_and_wall_time_worker_init(VALUE profiling_module) {
    rb_define_singleton_method(testing_module, "_native_gc_tracepoint", _native_gc_tracepoint, 1);
    rb_define_singleton_method(testing_module, "_native_simulate_handle_sampling_signal", _native_simulate_handle_sampling_signal, 0);
    rb_define_singleton_method(testing_module, "_native_simulate_sample_from_postponed_job", _native_simulate_sample_from_postponed_job, 0);
+   rb_define_singleton_method(testing_module, "_native_is_sigprof_blocked_in_current_thread", _native_is_sigprof_blocked_in_current_thread, 0);
  }

  // This structure is used to define a Ruby object that stores a pointer to a struct cpu_and_wall_time_worker_state
@@ -156,7 +206,7 @@ static const rb_data_type_t cpu_and_wall_time_worker_typed_data = {
    .function = {
      .dmark = cpu_and_wall_time_worker_typed_data_mark,
      .dfree = RUBY_DEFAULT_FREE,
-     .dsize = NULL, // We don't track profile memory usage (although it'd be cool if we did!)
+     .dsize = NULL, // We don't track memory usage (although it'd be cool if we did!)
      //.dcompact = NULL, // FIXME: Add support for compaction
    },
    .flags = RUBY_TYPED_FREE_IMMEDIATELY
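
For context on the `rb_data_type_t` above: this is the Ruby C API's TypedData mechanism for wrapping a native struct in a Ruby object, where `.dmark` tells the GC which `VALUE`s the struct references and `.dfree` releases the native memory. A generic sketch of the wrap/unwrap pattern this file uses throughout (the names here are illustrative, not from the gem):

    #include <ruby.h>

    struct my_state {
      VALUE some_ruby_object; // Must be marked so the GC doesn't collect it
    };

    static void my_state_mark(void *ptr) {
      struct my_state *state = (struct my_state *) ptr;
      rb_gc_mark(state->some_ruby_object);
    }

    static const rb_data_type_t my_state_typed_data = {
      .wrap_struct_name = "MyState",
      .function = {
        .dmark = my_state_mark,
        .dfree = RUBY_DEFAULT_FREE, // xfree the struct when the object is collected
        .dsize = NULL,
      },
      .flags = RUBY_TYPED_FREE_IMMEDIATELY
    };

    // Typically registered via rb_define_alloc_func, as this file does for _native_new
    static VALUE my_state_new(VALUE klass) {
      struct my_state *state = ruby_xcalloc(1, sizeof(struct my_state));
      state->some_ruby_object = Qnil;
      return TypedData_Wrap_Struct(klass, &my_state_typed_data, state);
    }

    static void use_state(VALUE instance) {
      struct my_state *state;
      // Raises TypeError if instance doesn't wrap this exact data type
      TypedData_Get_Struct(instance, struct my_state, &my_state_typed_data, state);
      (void) state; // ... use state ...
    }
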
@@ -165,20 +215,26 @@ static const rb_data_type_t cpu_and_wall_time_worker_typed_data = {
  static VALUE _native_new(VALUE klass) {
    struct cpu_and_wall_time_worker_state *state = ruby_xcalloc(1, sizeof(struct cpu_and_wall_time_worker_state));

-   state->should_run = false;
+   atomic_init(&state->should_run, false);
    state->gc_profiling_enabled = false;
    state->cpu_and_wall_time_collector_instance = Qnil;
+   state->idle_sampling_helper_instance = Qnil;
+   state->owner_thread = Qnil;
+   dynamic_sampling_rate_init(&state->dynamic_sampling_rate);
    state->failure_exception = Qnil;
+   state->stop_thread = Qnil;
    state->gc_tracepoint = Qnil;
+   reset_stats(state);

-   return TypedData_Wrap_Struct(klass, &cpu_and_wall_time_worker_typed_data, state);
+   return state->self_instance = TypedData_Wrap_Struct(klass, &cpu_and_wall_time_worker_typed_data, state);
  }

  static VALUE _native_initialize(
    DDTRACE_UNUSED VALUE _self,
    VALUE self_instance,
    VALUE cpu_and_wall_time_collector_instance,
-   VALUE gc_profiling_enabled
+   VALUE gc_profiling_enabled,
+   VALUE idle_sampling_helper_instance
  ) {
    ENFORCE_BOOLEAN(gc_profiling_enabled);
@@ -187,6 +243,7 @@ static VALUE _native_initialize(
    state->gc_profiling_enabled = (gc_profiling_enabled == Qtrue);
    state->cpu_and_wall_time_collector_instance = enforce_cpu_and_wall_time_collector_instance(cpu_and_wall_time_collector_instance);
+   state->idle_sampling_helper_instance = idle_sampling_helper_instance;
    state->gc_tracepoint = rb_tracepoint_new(Qnil, RUBY_INTERNAL_EVENT_GC_ENTER | RUBY_INTERNAL_EVENT_GC_EXIT, on_gc_event, NULL /* unused */);

    return Qtrue;
@@ -197,7 +254,10 @@ static void cpu_and_wall_time_worker_typed_data_mark(void *state_ptr) {
    struct cpu_and_wall_time_worker_state *state = (struct cpu_and_wall_time_worker_state *) state_ptr;

    rb_gc_mark(state->cpu_and_wall_time_collector_instance);
+   rb_gc_mark(state->idle_sampling_helper_instance);
+   rb_gc_mark(state->owner_thread);
    rb_gc_mark(state->failure_exception);
+   rb_gc_mark(state->stop_thread);
    rb_gc_mark(state->gc_tracepoint);
  }
@@ -206,8 +266,9 @@ static VALUE _native_sampling_loop(DDTRACE_UNUSED VALUE _self, VALUE instance) {
    struct cpu_and_wall_time_worker_state *state;
    TypedData_Get_Struct(instance, struct cpu_and_wall_time_worker_state, &cpu_and_wall_time_worker_typed_data, state);

-   if (active_sampler_owner_thread != Qnil) {
-     if (is_thread_alive(active_sampler_owner_thread)) {
+   struct cpu_and_wall_time_worker_state *old_state = active_sampler_instance_state;
+   if (old_state != NULL) {
+     if (is_thread_alive(old_state->owner_thread)) {
        rb_raise(
          rb_eRuntimeError,
          "Could not start CpuAndWallTimeWorker: There's already another instance of CpuAndWallTimeWorker active in a different thread"
@@ -221,23 +282,26 @@ static VALUE _native_sampling_loop(DDTRACE_UNUSED VALUE _self, VALUE instance) {
        // b) If this is the same instance of the CpuAndWallTimeWorker: if we call enable on a tracepoint that is already
        // enabled, it will start firing more than once, see https://bugs.ruby-lang.org/issues/19114 for details.

-       struct cpu_and_wall_time_worker_state *old_state;
-       TypedData_Get_Struct(active_sampler_instance, struct cpu_and_wall_time_worker_state, &cpu_and_wall_time_worker_typed_data, old_state);
        rb_tracepoint_disable(old_state->gc_tracepoint);
      }
    }

+   // We use `stop_thread` to distinguish when `_native_stop` was called before we actually had a chance to start. In this
+   // situation we stop immediately and never even start the sampling trigger loop.
+   if (state->stop_thread == rb_thread_current()) return Qnil;
+
+   // Reset the dynamic sampling rate state, if any (reminder: the monotonic clock reference may change after a fork)
+   dynamic_sampling_rate_reset(&state->dynamic_sampling_rate);
+
    // This write to a global is thread-safe BECAUSE we're still holding on to the global VM lock at this point
+   active_sampler_instance_state = state;
    active_sampler_instance = instance;
-   active_sampler_owner_thread = rb_thread_current();
+   state->owner_thread = rb_thread_current();

-   state->should_run = true;
+   atomic_store(&state->should_run, true);

    block_sigprof_signal_handler_from_running_in_current_thread(); // We want to interrupt the thread with the global VM lock, never this one

-   install_sigprof_signal_handler(handle_sampling_signal);
-   if (state->gc_profiling_enabled) rb_tracepoint_enable(state->gc_tracepoint);
-
    // Release GVL, get to the actual work!
    int exception_state;
    rb_protect(release_gvl_and_run_sampling_trigger_loop, instance, &exception_state);
@@ -245,9 +309,32 @@ static VALUE _native_sampling_loop(DDTRACE_UNUSED VALUE _self, VALUE instance) {
    // The sample trigger loop finished (either cleanly or with an error); let's clean up

    rb_tracepoint_disable(state->gc_tracepoint);
-   remove_sigprof_signal_handler();
+
+   active_sampler_instance_state = NULL;
    active_sampler_instance = Qnil;
-   active_sampler_owner_thread = Qnil;
+   state->owner_thread = Qnil;
+
+   // If this `Thread` is about to die, why is this important? It's because Ruby caches native threads for a period after
+   // the `Thread` dies, and reuses them if a new Ruby `Thread` gets created. This means that while conceptually the
+   // worker background `Thread` is about to die, the low-level native OS thread can be reused for something else in the Ruby app.
+   // Then, the reused thread would "inherit" the SIGPROF blocking, which is... really unexpected.
+   // This actually caused a flaky test -- the `native_extension_spec.rb` creates a `Thread` and tries to specifically
+   // send SIGPROF signals to it, and oops it could fail if it got the reused native thread from the worker which still
+   // had SIGPROF delivery blocked. :hide_the_pain_harold:
+   unblock_sigprof_signal_handler_from_running_in_current_thread();
+
+   // Why replace and not remove the signal handler? We do this because when a process receives a SIGPROF without
+   // having an explicit signal handler set up, the process will instantly terminate with a confusing
+   // "Profiling timer expired" message left behind. (This message doesn't come from us -- it's the default message for
+   // an unhandled SIGPROF. Pretty confusing UNIX/POSIX behavior...)
+   //
+   // Unfortunately, because signal delivery is asynchronous, there's no way to guarantee that there are no pending
+   // profiler-sent signals by the time we get here and want to clean up.
+   // @ivoanjo: I suspect this will never happen, but the cost of getting it wrong is really high (VM terminates) so this
+   // is a just-in-case situation.
+   //
+   // Note 2: This can raise exceptions as well, so make sure that all cleanups are done by the time we get here.
+   replace_sigprof_signal_handler_with_empty_handler(handle_sampling_signal);

    // Ensure that instance is not garbage collected while the native sampling loop is running; this is probably not needed, but just in case
    RB_GC_GUARD(instance);
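
`replace_sigprof_signal_handler_with_empty_handler` and `empty_signal_handler` live in the new setup_signal_handler.c, which this diff lists but does not expand. The idea, roughly, is to keep a no-op handler installed so any still-pending SIGPROF is swallowed instead of triggering the fatal default action; a sketch of that idea, under the assumption that the real version also validates which handler it is replacing:

    #include <signal.h>
    #include <stddef.h>

    // A handler that does nothing: pending SIGPROFs get consumed here instead of
    // hitting the default action, which terminates the process with the cryptic
    // "Profiling timer expired" message.
    static void empty_signal_handler(int signal, siginfo_t *info, void *ucontext) {
      (void) signal; (void) info; (void) ucontext; // Deliberately do nothing
    }

    static void install_empty_sigprof_handler(void) {
      struct sigaction config = {
        .sa_flags = SA_RESTART | SA_SIGINFO,
        .sa_sigaction = empty_signal_handler,
      };
      sigemptyset(&config.sa_mask);
      sigaction(SIGPROF, &config, NULL); // Error handling omitted in this sketch
    }
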
@@ -257,7 +344,12 @@ static VALUE _native_sampling_loop(DDTRACE_UNUSED VALUE _self, VALUE instance) {
    return Qnil;
  }

- static VALUE _native_stop(DDTRACE_UNUSED VALUE _self, VALUE self_instance) {
+ static VALUE _native_stop(DDTRACE_UNUSED VALUE _self, VALUE self_instance, VALUE worker_thread) {
+   struct cpu_and_wall_time_worker_state *state;
+   TypedData_Get_Struct(self_instance, struct cpu_and_wall_time_worker_state, &cpu_and_wall_time_worker_typed_data, state);
+
+   state->stop_thread = worker_thread;
+
    return stop(self_instance, /* optional_exception: */ Qnil);
  }
@@ -265,7 +357,7 @@ static VALUE stop(VALUE self_instance, VALUE optional_exception) {
    struct cpu_and_wall_time_worker_state *state;
    TypedData_Get_Struct(self_instance, struct cpu_and_wall_time_worker_state, &cpu_and_wall_time_worker_typed_data, state);

-   state->should_run = false;
+   atomic_store(&state->should_run, false);
    state->failure_exception = optional_exception;

    // Disable the GC tracepoint as soon as possible, so the VM doesn't keep on calling it
@@ -274,92 +366,80 @@ static VALUE stop(VALUE self_instance, VALUE optional_exception) {
    return Qtrue;
  }

- static void install_sigprof_signal_handler(void (*signal_handler_function)(int, siginfo_t *, void *)) {
-   struct sigaction existing_signal_handler_config = {.sa_sigaction = NULL};
-   struct sigaction signal_handler_config = {
-     .sa_flags = SA_RESTART | SA_SIGINFO,
-     .sa_sigaction = signal_handler_function
-   };
-   sigemptyset(&signal_handler_config.sa_mask);
-
-   if (sigaction(SIGPROF, &signal_handler_config, &existing_signal_handler_config) != 0) {
-     rb_sys_fail("Could not start CpuAndWallTimeWorker: Could not install signal handler");
-   }
-
-   // In some corner cases (e.g. after a fork), our signal handler may still be around, and that's ok
-   if (existing_signal_handler_config.sa_sigaction == handle_sampling_signal) return;
-
-   if (existing_signal_handler_config.sa_handler != NULL || existing_signal_handler_config.sa_sigaction != NULL) {
-     // A previous signal handler already existed. Currently we don't support this situation, so let's just back out
-     // of the installation.
-
-     if (sigaction(SIGPROF, &existing_signal_handler_config, NULL) != 0) {
-       rb_sys_fail(
-         "Could not start CpuAndWallTimeWorker: Could not re-install pre-existing SIGPROF signal handler. " \
-         "This may break the component that had installed it."
-       );
-     }
-
-     rb_raise(rb_eRuntimeError, "Could not start CpuAndWallTimeWorker: There's a pre-existing SIGPROF signal handler");
-   }
- }
-
- static void remove_sigprof_signal_handler(void) {
-   struct sigaction signal_handler_config = {
-     .sa_handler = SIG_DFL, // Reset back to default
-     .sa_flags = SA_RESTART // TODO: Unclear if this is actually needed/does anything at all
-   };
-   sigemptyset(&signal_handler_config.sa_mask);
-
-   if (sigaction(SIGPROF, &signal_handler_config, NULL) != 0) rb_sys_fail("Failure while removing the signal handler");
- }
-
- static void block_sigprof_signal_handler_from_running_in_current_thread(void) {
-   sigset_t signals_to_block;
-   sigemptyset(&signals_to_block);
-   sigaddset(&signals_to_block, SIGPROF);
-   pthread_sigmask(SIG_BLOCK, &signals_to_block, NULL);
- }
-
  // NOTE: Remember that this will run in the thread and within the scope of user code, including user C code.
  // We need to be careful not to change any state that may be observed OR to restore it if we do. For instance, if anything
  // we do here can set `errno`, then we must be careful to restore the old `errno` after the fact.
  static void handle_sampling_signal(DDTRACE_UNUSED int _signal, DDTRACE_UNUSED siginfo_t *_info, DDTRACE_UNUSED void *_ucontext) {
-   if (!ruby_thread_has_gvl_p()) {
-     return; // Not safe to enqueue a sample from this thread
-   }
-   if (!ddtrace_rb_ractor_main_p()) {
-     return; // We're not on the main Ractor; we currently don't support profiling non-main Ractors
+   struct cpu_and_wall_time_worker_state *state = active_sampler_instance_state; // Read from global variable, see "sampler global state safety" note above
+
+   // This can potentially happen if the CpuAndWallTimeWorker was stopped while the signal delivery was happening; nothing to do
+   if (state == NULL) return;
+
+   if (
+     !ruby_native_thread_p() || // Not a Ruby thread
+     !is_current_thread_holding_the_gvl() || // Not safe to enqueue a sample from this thread
+     !ddtrace_rb_ractor_main_p() // We're not on the main Ractor; we currently don't support profiling non-main Ractors
+   ) {
+     state->stats.signal_handler_wrong_thread++;
+     return;
    }

    // We implicitly assume there can be no concurrent nor nested calls to handle_sampling_signal because
    // a) we get triggered using SIGPROF, and the docs state a second SIGPROF will not interrupt an existing one
    // b) we validate we are in the thread that has the global VM lock; if a different thread gets a signal, it will return early
    // because it will not have the global VM lock
-   // TODO: Validate that this does not impact Ractors

    // Note: rb_postponed_job_register_one ensures that if there's a previous sample_from_postponed_job queued for execution
    // then we will not queue a second one. It does this by doing a linear scan on the existing jobs; in the future we
    // may want to implement that check ourselves.

-   // TODO: Do something with result (potentially update tracking counters?)
+   state->stats.signal_handler_enqueued_sample++;
+
+   // Note: If we ever want to get rid of rb_postponed_job_register_one, remember not to clobber Ruby exceptions, as
+   // this function does this helpful job for us now -- https://github.com/ruby/ruby/commit/a98e343d39c4d7bf1e2190b076720f32d9f298b3.
    /*int result =*/ rb_postponed_job_register_one(0, sample_from_postponed_job, NULL);
+   // TODO: Do something with result (potentially update tracking counters?)
  }

  // The actual sampling trigger loop always runs **without** the global vm lock.
  static void *run_sampling_trigger_loop(void *state_ptr) {
    struct cpu_and_wall_time_worker_state *state = (struct cpu_and_wall_time_worker_state *) state_ptr;

-   struct timespec time_between_signals = {.tv_nsec = 10 * 1000 * 1000 /* 10ms */};
+   uint64_t minimum_time_between_signals = MILLIS_AS_NS(10);
+
+   while (atomic_load(&state->should_run)) {
+     state->stats.trigger_sample_attempts++;

-   while (state->should_run) {
      // TODO: This is still a placeholder for a more complex mechanism. In particular:
-     // * We want to signal a particular thread or threads, not the process in general
-     // * We want to track if a signal landed on the thread holding the global VM lock and do something about it
      // * We want to do more than having a fixed sampling rate

-     kill(getpid(), SIGPROF);
-     nanosleep(&time_between_signals, NULL);
+     current_gvl_owner owner = gvl_owner();
+     if (owner.valid) {
+       // Note that reading the GVL owner and sending them a signal is a race -- the Ruby VM keeps on executing while
+       // we're doing this, so we may still not signal the correct thread from time to time, but our signal handler
+       // includes a check to see if it got called in the right thread
+       pthread_kill(owner.owner, SIGPROF);
+     } else {
+       // If no thread owns the Global VM Lock, the application is probably idle at the moment. We still want to sample
+       // so we "ask a friend" (the IdleSamplingHelper component) to grab the GVL and simulate getting a SIGPROF.
+       //
+       // In a previous version of the code, we called `grab_gvl_and_sample` directly BUT this was problematic because
+       // Ruby may concurrently get busy and so the CpuAndWallTimeWorker would be blocked in line to acquire the GVL
+       // for an uncontrolled amount of time. (This can still happen to the IdleSamplingHelper, but the
+       // CpuAndWallTimeWorker will still be free to interrupt the Ruby VM and keep sampling for the entire blocking period).
+       state->stats.trigger_simulated_signal_delivery_attempts++;
+       idle_sampling_helper_request_action(state->idle_sampling_helper_instance, grab_gvl_and_sample);
+     }
+
+     sleep_for(minimum_time_between_signals);
+
+     // The dynamic sampling rate module keeps track of how long samples are taking, and in here we extend our sleep time
+     // to take that into account.
+     // Note that we deliberately should NOT combine this sleep_for with the one above because the result of
+     // `dynamic_sampling_rate_get_sleep` may have changed while the above sleep was ongoing.
+     uint64_t extra_sleep =
+       dynamic_sampling_rate_get_sleep(&state->dynamic_sampling_rate, monotonic_wall_time_now_ns(DO_NOT_RAISE_ON_FAILURE));
+     if (extra_sleep > 0) sleep_for(extra_sleep);
    }

    return NULL; // Unused
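
The core change in this loop: `kill(getpid(), SIGPROF)` was process-directed, so the kernel could deliver the signal to any thread not blocking SIGPROF, while `pthread_kill` is thread-directed and targets the GVL owner specifically. A self-contained sketch of the difference; `gvl_owner()` is part of the gem's private VM API layer, so a plain `pthread_self()` stands in for it here:

    #include <signal.h>
    #include <pthread.h>
    #include <unistd.h>

    static void on_sigprof(int signal) { (void) signal; /* sampling would happen here */ }

    int main(void) {
      signal(SIGPROF, on_sigprof);

      // Process-directed: the kernel delivers to ANY thread that isn't blocking
      // SIGPROF, which is why the old code needed "was this the right thread?" checks.
      kill(getpid(), SIGPROF);

      // Thread-directed: delivered to exactly this thread (here, the main thread).
      // In the profiler this is still racy -- the GVL owner can change between
      // reading it and signaling -- hence the re-check inside the signal handler.
      pthread_kill(pthread_self(), SIGPROF);

      return 0;
    }
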
@@ -369,14 +449,14 @@ static void *run_sampling_trigger_loop(void *state_ptr) {
  static void interrupt_sampling_trigger_loop(void *state_ptr) {
    struct cpu_and_wall_time_worker_state *state = (struct cpu_and_wall_time_worker_state *) state_ptr;

-   state->should_run = false;
+   atomic_store(&state->should_run, false);
  }

  static void sample_from_postponed_job(DDTRACE_UNUSED void *_unused) {
-   VALUE instance = active_sampler_instance; // Read from global variable
+   struct cpu_and_wall_time_worker_state *state = active_sampler_instance_state; // Read from global variable, see "sampler global state safety" note above

    // This can potentially happen if the CpuAndWallTimeWorker was stopped while the postponed job was waiting to be executed; nothing to do
-   if (instance == Qnil) return;
+   if (state == NULL) return;

    // @ivoanjo: I'm not sure this can ever happen because `handle_sampling_signal` only enqueues this callback if
    // it's running on the main Ractor, but just in case...
@@ -384,14 +464,45 @@ static void sample_from_postponed_job(DDTRACE_UNUSED void *_unused) {
      return; // We're not on the main Ractor; we currently don't support profiling non-main Ractors
    }

+   // Rescue against any exceptions that happen during sampling
+   safely_call(rescued_sample_from_postponed_job, state->self_instance, state->self_instance);
+ }
+
+ static VALUE rescued_sample_from_postponed_job(VALUE self_instance) {
    struct cpu_and_wall_time_worker_state *state;
-   TypedData_Get_Struct(instance, struct cpu_and_wall_time_worker_state, &cpu_and_wall_time_worker_typed_data, state);
+   TypedData_Get_Struct(self_instance, struct cpu_and_wall_time_worker_state, &cpu_and_wall_time_worker_typed_data, state);

-   // Trigger sampling using the Collectors::CpuAndWallTime; rescue against any exceptions that happen during sampling
-   safely_call(cpu_and_wall_time_collector_sample, state->cpu_and_wall_time_collector_instance, instance);
+   long wall_time_ns_before_sample = monotonic_wall_time_now_ns(RAISE_ON_FAILURE);
+
+   if (!dynamic_sampling_rate_should_sample(&state->dynamic_sampling_rate, wall_time_ns_before_sample)) {
+     // TODO: Add a counter for this
+     return Qnil;
+   }
+
+   state->stats.sampled++;
+
+   cpu_and_wall_time_collector_sample(state->cpu_and_wall_time_collector_instance, wall_time_ns_before_sample);
+
+   long wall_time_ns_after_sample = monotonic_wall_time_now_ns(RAISE_ON_FAILURE);
+   long delta_ns = wall_time_ns_after_sample - wall_time_ns_before_sample;
+
+   // Guard against wall-time going backwards, see https://github.com/DataDog/dd-trace-rb/pull/2336 for discussion.
+   uint64_t sampling_time_ns = delta_ns < 0 ? 0 : delta_ns;
+
+   state->stats.sampling_time_ns_min = uint64_min_of(sampling_time_ns, state->stats.sampling_time_ns_min);
+   state->stats.sampling_time_ns_max = uint64_max_of(sampling_time_ns, state->stats.sampling_time_ns_max);
+   state->stats.sampling_time_ns_total += sampling_time_ns;
+
+   dynamic_sampling_rate_after_sample(&state->dynamic_sampling_rate, wall_time_ns_after_sample, sampling_time_ns);
+
+   // Return a dummy VALUE because we're called from rb_rescue2 which requires it
+   return Qnil;
  }

- static VALUE handle_sampling_failure(VALUE self_instance, VALUE exception) { return stop(self_instance, exception); }
+ static VALUE handle_sampling_failure(VALUE self_instance, VALUE exception) {
+   stop(self_instance, exception);
+   return Qnil;
+ }

  // This method exists only to enable testing Datadog::Profiling::Collectors::CpuAndWallTimeWorker behavior using RSpec.
  // It SHOULD NOT be used for other purposes.
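
`monotonic_wall_time_now_ns` comes from the new time_helpers.c, which this diff lists but does not expand; the negative-delta guard exists because, as the linked PR discusses, even monotonic readings can misbehave in rare setups. One plausible shape for such a helper, assuming it wraps `clock_gettime` (a sketch, not the gem's actual implementation):

    #include <time.h>
    #include <stdint.h>

    #define SECONDS_AS_NS(value) ((uint64_t) (value) * 1000 * 1000 * 1000)

    // Returns the current monotonic clock reading in nanoseconds, or 0 on failure.
    // (The real helper also distinguishes RAISE_ON_FAILURE / DO_NOT_RAISE_ON_FAILURE.)
    static long monotonic_wall_time_now_ns(void) {
      struct timespec now;
      if (clock_gettime(CLOCK_MONOTONIC, &now) != 0) return 0;
      return (long) (SECONDS_AS_NS(now.tv_sec) + now.tv_nsec);
    }
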
@@ -403,6 +514,8 @@ static VALUE _native_current_sigprof_signal_handler(DDTRACE_UNUSED VALUE self) {
    if (existing_signal_handler_config.sa_sigaction == handle_sampling_signal) {
      return ID2SYM(rb_intern("profiling"));
+   } else if (existing_signal_handler_config.sa_sigaction == empty_signal_handler) {
+     return ID2SYM(rb_intern("empty"));
    } else if (existing_signal_handler_config.sa_sigaction != NULL) {
      return ID2SYM(rb_intern("other"));
    } else {
@@ -414,6 +527,11 @@ static VALUE release_gvl_and_run_sampling_trigger_loop(VALUE instance) {
    struct cpu_and_wall_time_worker_state *state;
    TypedData_Get_Struct(instance, struct cpu_and_wall_time_worker_state, &cpu_and_wall_time_worker_typed_data, state);

+   // Final preparations: Setup signal handler and enable tracepoint. We run these here and not in `_native_sampling_loop`
+   // because they may raise exceptions.
+   install_sigprof_signal_handler(handle_sampling_signal, "handle_sampling_signal");
+   if (state->gc_profiling_enabled) rb_tracepoint_enable(state->gc_tracepoint);
+
    rb_thread_call_without_gvl(run_sampling_trigger_loop, state, interrupt_sampling_trigger_loop, state);

    // If we stopped sampling due to an exception, re-raise it (now in the worker thread)
@@ -425,9 +543,9 @@ static VALUE release_gvl_and_run_sampling_trigger_loop(VALUE instance) {
  // This method exists only to enable testing Datadog::Profiling::Collectors::CpuAndWallTimeWorker behavior using RSpec.
  // It SHOULD NOT be used for other purposes.
  static VALUE _native_is_running(DDTRACE_UNUSED VALUE self, VALUE instance) {
-   return \
-     (active_sampler_owner_thread != Qnil && is_thread_alive(active_sampler_owner_thread) && active_sampler_instance == instance) ?
-     Qtrue : Qfalse;
+   struct cpu_and_wall_time_worker_state *state = active_sampler_instance_state; // Read from global variable, see "sampler global state safety" note above
+
+   return (state != NULL && is_thread_alive(state->owner_thread) && state->self_instance == instance) ? Qtrue : Qfalse;
  }

  static void testing_signal_handler(DDTRACE_UNUSED int _signal, DDTRACE_UNUSED siginfo_t *_info, DDTRACE_UNUSED void *_ucontext) {
@@ -437,7 +555,7 @@ static void testing_signal_handler(DDTRACE_UNUSED int _signal, DDTRACE_UNUSED si
  // This method exists only to enable testing Datadog::Profiling::Collectors::CpuAndWallTimeWorker behavior using RSpec.
  // It SHOULD NOT be used for other purposes.
  static VALUE _native_install_testing_signal_handler(DDTRACE_UNUSED VALUE self) {
-   install_sigprof_signal_handler(testing_signal_handler);
+   install_sigprof_signal_handler(testing_signal_handler, "testing_signal_handler");
    return Qtrue;
  }
@@ -485,16 +603,11 @@ static void on_gc_event(VALUE tracepoint_data, DDTRACE_UNUSED void *unused) {
    int event = rb_tracearg_event_flag(rb_tracearg_from_tracepoint(tracepoint_data));
    if (event != RUBY_INTERNAL_EVENT_GC_ENTER && event != RUBY_INTERNAL_EVENT_GC_EXIT) return; // Unknown event

-   VALUE instance = active_sampler_instance; // Read from global variable
+   struct cpu_and_wall_time_worker_state *state = active_sampler_instance_state; // Read from global variable, see "sampler global state safety" note above

    // This should not happen in a normal situation because the tracepoint is always enabled after the instance is set
    // and disabled before it is cleared, but just in case...
-   if (instance == Qnil) return;
-
-   struct cpu_and_wall_time_worker_state *state;
-   if (!rb_typeddata_is_kind_of(instance, &cpu_and_wall_time_worker_typed_data)) return;
-   // This should never fail if the above check passes
-   TypedData_Get_Struct(instance, struct cpu_and_wall_time_worker_state, &cpu_and_wall_time_worker_typed_data, state);
+   if (state == NULL) return;

    if (event == RUBY_INTERNAL_EVENT_GC_ENTER) {
      cpu_and_wall_time_collector_on_gc_start(state->cpu_and_wall_time_collector_instance);
@@ -517,15 +630,18 @@ static void on_gc_event(VALUE tracepoint_data, DDTRACE_UNUSED void *unused) {
      cpu_and_wall_time_collector_on_gc_finish(state->cpu_and_wall_time_collector_instance);
      // We use rb_postponed_job_register_one to ask Ruby to run cpu_and_wall_time_collector_sample_after_gc after it
      // fully finishes the garbage collection, so that one is allowed to do allocations and throw exceptions as usual.
+     //
+     // Note: If we ever want to get rid of rb_postponed_job_register_one, remember not to clobber Ruby exceptions, as
+     // this function does this helpful job for us now -- https://github.com/ruby/ruby/commit/a98e343d39c4d7bf1e2190b076720f32d9f298b3.
      rb_postponed_job_register_one(0, after_gc_from_postponed_job, NULL);
    }
  }

  static void after_gc_from_postponed_job(DDTRACE_UNUSED void *_unused) {
-   VALUE instance = active_sampler_instance; // Read from global variable
+   struct cpu_and_wall_time_worker_state *state = active_sampler_instance_state; // Read from global variable, see "sampler global state safety" note above

    // This can potentially happen if the CpuAndWallTimeWorker was stopped while the postponed job was waiting to be executed; nothing to do
-   if (instance == Qnil) return;
+   if (state == NULL) return;

    // @ivoanjo: I'm not sure this can ever happen because `on_gc_event` only enqueues this callback if
    // it's running on the main Ractor, but just in case...
@@ -533,18 +649,15 @@ static void after_gc_from_postponed_job(DDTRACE_UNUSED void *_unused) {
      return; // We're not on the main Ractor; we currently don't support profiling non-main Ractors
    }

-   struct cpu_and_wall_time_worker_state *state;
-   TypedData_Get_Struct(instance, struct cpu_and_wall_time_worker_state, &cpu_and_wall_time_worker_typed_data, state);
-
    // Trigger sampling using the Collectors::CpuAndWallTime; rescue against any exceptions that happen during sampling
-   safely_call(cpu_and_wall_time_collector_sample_after_gc, state->cpu_and_wall_time_collector_instance, instance);
+   safely_call(cpu_and_wall_time_collector_sample_after_gc, state->cpu_and_wall_time_collector_instance, state->self_instance);
  }

  // Equivalent to Ruby begin/rescue call, where we call a C function and jump to the exception handler if an
  // exception gets raised within
- static void safely_call(VALUE (*function_to_call_safely)(VALUE), VALUE function_to_call_safely_arg, VALUE instance) {
+ static VALUE safely_call(VALUE (*function_to_call_safely)(VALUE), VALUE function_to_call_safely_arg, VALUE instance) {
    VALUE exception_handler_function_arg = instance;
-   rb_rescue2(
+   return rb_rescue2(
      function_to_call_safely,
      function_to_call_safely_arg,
      handle_sampling_failure,
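
`rb_rescue2` is the C-level equivalent of Ruby's `begin/rescue`: it calls the first function and, if an exception matching one of the listed classes is raised, calls the rescue function instead of propagating. The call in `safely_call` continues past the end of this hunk; a complete, self-contained usage sketch with illustrative names:

    #include <ruby.h>

    static VALUE do_risky_work(VALUE arg) {
      // ... anything here may raise a Ruby exception ...
      return Qnil;
    }

    static VALUE on_failure(VALUE arg, VALUE exception) {
      // Runs instead of propagating; `exception` is the raised Ruby exception object
      return Qnil;
    }

    static VALUE call_safely(VALUE arg) {
      // The vararg list of exception classes to rescue must end with (VALUE) 0;
      // rb_eException catches everything, like `rescue Exception` in Ruby.
      return rb_rescue2(do_risky_work, arg, on_failure, arg, rb_eException, (VALUE) 0);
    }
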
@@ -567,3 +680,97 @@ static VALUE _native_simulate_sample_from_postponed_job(DDTRACE_UNUSED VALUE sel
    sample_from_postponed_job(NULL);
    return Qtrue;
  }
+
+ // After the Ruby VM forks, this method gets called in the child process to clean up any leftover state from the parent.
+ //
+ // Assumption: This method gets called BEFORE restarting profiling. Note that profiling-related tracepoints may still
+ // be active, so we make sure to disable them before calling into anything else, so that there are no components
+ // attempting to trigger samples at the same time as the reset is done.
+ //
+ // In the future, if we add other components with tracepoints, we will need to coordinate stopping all such
+ // tracepoints before doing the other cleaning steps.
+ static VALUE _native_reset_after_fork(DDTRACE_UNUSED VALUE self, VALUE instance) {
+   struct cpu_and_wall_time_worker_state *state;
+   TypedData_Get_Struct(instance, struct cpu_and_wall_time_worker_state, &cpu_and_wall_time_worker_typed_data, state);
+
+   // Disable all tracepoints, so that there are no more attempts to mutate the profile
+   rb_tracepoint_disable(state->gc_tracepoint);
+
+   reset_stats(state);
+
+   // Remove all state from the `Collectors::CpuAndWallTime` and connected downstream components
+   rb_funcall(state->cpu_and_wall_time_collector_instance, rb_intern("reset_after_fork"), 0);
+
+   return Qtrue;
+ }
+
+ static VALUE _native_is_sigprof_blocked_in_current_thread(DDTRACE_UNUSED VALUE self) {
+   return is_sigprof_blocked_in_current_thread();
+ }
+
+ static VALUE _native_stats(DDTRACE_UNUSED VALUE self, VALUE instance) {
+   struct cpu_and_wall_time_worker_state *state;
+   TypedData_Get_Struct(instance, struct cpu_and_wall_time_worker_state, &cpu_and_wall_time_worker_typed_data, state);
+
+   VALUE pretty_sampling_time_ns_min = state->stats.sampling_time_ns_min == UINT64_MAX ? Qnil : ULL2NUM(state->stats.sampling_time_ns_min);
+   VALUE pretty_sampling_time_ns_max = state->stats.sampling_time_ns_max == 0 ? Qnil : ULL2NUM(state->stats.sampling_time_ns_max);
+   VALUE pretty_sampling_time_ns_total = state->stats.sampling_time_ns_total == 0 ? Qnil : ULL2NUM(state->stats.sampling_time_ns_total);
+   VALUE pretty_sampling_time_ns_avg =
+     state->stats.sampled == 0 ? Qnil : DBL2NUM(((double) state->stats.sampling_time_ns_total) / state->stats.sampled);
+
+   VALUE stats_as_hash = rb_hash_new();
+   VALUE arguments[] = {
+     ID2SYM(rb_intern("trigger_sample_attempts")), /* => */ UINT2NUM(state->stats.trigger_sample_attempts),
+     ID2SYM(rb_intern("trigger_simulated_signal_delivery_attempts")), /* => */ UINT2NUM(state->stats.trigger_simulated_signal_delivery_attempts),
+     ID2SYM(rb_intern("simulated_signal_delivery")), /* => */ UINT2NUM(state->stats.simulated_signal_delivery),
+     ID2SYM(rb_intern("signal_handler_enqueued_sample")), /* => */ UINT2NUM(state->stats.signal_handler_enqueued_sample),
+     ID2SYM(rb_intern("signal_handler_wrong_thread")), /* => */ UINT2NUM(state->stats.signal_handler_wrong_thread),
+     ID2SYM(rb_intern("sampled")), /* => */ UINT2NUM(state->stats.sampled),
+     ID2SYM(rb_intern("sampling_time_ns_min")), /* => */ pretty_sampling_time_ns_min,
+     ID2SYM(rb_intern("sampling_time_ns_max")), /* => */ pretty_sampling_time_ns_max,
+     ID2SYM(rb_intern("sampling_time_ns_total")), /* => */ pretty_sampling_time_ns_total,
+     ID2SYM(rb_intern("sampling_time_ns_avg")), /* => */ pretty_sampling_time_ns_avg,
+   };
+   for (long unsigned int i = 0; i < VALUE_COUNT(arguments); i += 2) rb_hash_aset(stats_as_hash, arguments[i], arguments[i+1]);
+   return stats_as_hash;
+ }
+
+ void *simulate_sampling_signal_delivery(DDTRACE_UNUSED void *_unused) {
+   struct cpu_and_wall_time_worker_state *state = active_sampler_instance_state; // Read from global variable, see "sampler global state safety" note above
+
+   // This can potentially happen if the CpuAndWallTimeWorker was stopped while the IdleSamplingHelper was trying to execute this action
+   if (state == NULL) return NULL;
+
+   state->stats.simulated_signal_delivery++;
+
+   // @ivoanjo: We could instead directly call sample_from_postponed_job, but I chose to go through the signal handler
+   // so that the simulated case is as close to the original one as possible (including any metrics increases, etc).
+   handle_sampling_signal(0, NULL, NULL);
+
+   return NULL; // Unused
+ }
+
+ static void grab_gvl_and_sample(void) { rb_thread_call_with_gvl(simulate_sampling_signal_delivery, NULL); }
+
+ static void reset_stats(struct cpu_and_wall_time_worker_state *state) {
+   state->stats = (struct stats) {}; // Resets all stats back to zero
+   state->stats.sampling_time_ns_min = UINT64_MAX; // Since we always take the min between existing and latest sample
+ }
+
+ static void sleep_for(uint64_t time_ns) {
+   // As a simplification, we currently only support setting .tv_nsec
+   if (time_ns >= SECONDS_AS_NS(1)) {
+     grab_gvl_and_raise(rb_eArgError, "sleep_for can only sleep for less than 1 second, time_ns: %"PRIu64, time_ns);
+   }
+
+   struct timespec time_to_sleep = {.tv_nsec = time_ns};
+
+   while (nanosleep(&time_to_sleep, &time_to_sleep) != 0) {
+     if (errno == EINTR) {
+       // We were interrupted. nanosleep updates "time_to_sleep" to contain only the remaining time, so we just let the
+       // loop keep going.
+     } else {
+       ENFORCE_SUCCESS_NO_GVL(errno);
+     }
+   }
+ }
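
As the comment in `sleep_for` notes, it only fills in `.tv_nsec` and rejects requests of one second or more. If longer sleeps were ever needed, the usual extension is to split the duration into `tv_sec`/`tv_nsec` while keeping the same EINTR retry loop; a sketch of that hypothetical variant (not part of this diff):

    #include <time.h>
    #include <errno.h>
    #include <stdint.h>

    #define SECONDS_AS_NS(value) ((uint64_t) (value) * 1000 * 1000 * 1000)

    // Sleeps for an arbitrary duration by splitting it into whole seconds and
    // leftover nanoseconds, retrying when interrupted by a signal.
    static void sleep_for_any_duration(uint64_t time_ns) {
      struct timespec time_to_sleep = {
        .tv_sec = (time_t) (time_ns / SECONDS_AS_NS(1)),
        .tv_nsec = (long) (time_ns % SECONDS_AS_NS(1)),
      };

      while (nanosleep(&time_to_sleep, &time_to_sleep) != 0 && errno == EINTR) {
        // nanosleep wrote the remaining time into time_to_sleep; keep going
      }
    }
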