ddtrace 1.18.0 → 1.20.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (151)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +82 -1
  3. data/ext/ddtrace_profiling_native_extension/collectors_cpu_and_wall_time_worker.c +96 -66
  4. data/ext/ddtrace_profiling_native_extension/collectors_discrete_dynamic_sampler.c +349 -0
  5. data/ext/ddtrace_profiling_native_extension/collectors_discrete_dynamic_sampler.h +89 -0
  6. data/ext/ddtrace_profiling_native_extension/collectors_dynamic_sampling_rate.c +22 -14
  7. data/ext/ddtrace_profiling_native_extension/collectors_dynamic_sampling_rate.h +4 -0
  8. data/ext/ddtrace_profiling_native_extension/collectors_gc_profiling_helper.c +156 -0
  9. data/ext/ddtrace_profiling_native_extension/collectors_gc_profiling_helper.h +5 -0
  10. data/ext/ddtrace_profiling_native_extension/collectors_stack.c +43 -102
  11. data/ext/ddtrace_profiling_native_extension/collectors_stack.h +10 -3
  12. data/ext/ddtrace_profiling_native_extension/collectors_thread_context.c +159 -124
  13. data/ext/ddtrace_profiling_native_extension/collectors_thread_context.h +2 -1
  14. data/ext/ddtrace_profiling_native_extension/extconf.rb +19 -0
  15. data/ext/ddtrace_profiling_native_extension/heap_recorder.c +970 -0
  16. data/ext/ddtrace_profiling_native_extension/heap_recorder.h +155 -0
  17. data/ext/ddtrace_profiling_native_extension/helpers.h +6 -0
  18. data/ext/ddtrace_profiling_native_extension/libdatadog_helpers.c +20 -0
  19. data/ext/ddtrace_profiling_native_extension/libdatadog_helpers.h +11 -0
  20. data/ext/ddtrace_profiling_native_extension/private_vm_api_access.c +5 -0
  21. data/ext/ddtrace_profiling_native_extension/profiling.c +17 -0
  22. data/ext/ddtrace_profiling_native_extension/ruby_helpers.c +147 -0
  23. data/ext/ddtrace_profiling_native_extension/ruby_helpers.h +28 -0
  24. data/ext/ddtrace_profiling_native_extension/stack_recorder.c +329 -10
  25. data/ext/ddtrace_profiling_native_extension/stack_recorder.h +3 -0
  26. data/ext/ddtrace_profiling_native_extension/time_helpers.h +2 -0
  27. data/lib/datadog/appsec/contrib/rack/request_middleware.rb +2 -1
  28. data/lib/datadog/core/configuration/settings.rb +153 -21
  29. data/lib/datadog/core/environment/class_count.rb +6 -6
  30. data/lib/datadog/core/remote/component.rb +25 -12
  31. data/lib/datadog/core/remote/ext.rb +1 -0
  32. data/lib/datadog/core/remote/tie/tracing.rb +39 -0
  33. data/lib/datadog/core/remote/tie.rb +27 -0
  34. data/lib/datadog/core/telemetry/collector.rb +10 -0
  35. data/lib/datadog/core/telemetry/event.rb +2 -1
  36. data/lib/datadog/core/telemetry/ext.rb +3 -0
  37. data/lib/datadog/core/telemetry/v1/app_event.rb +8 -1
  38. data/lib/datadog/core/telemetry/v1/install_signature.rb +38 -0
  39. data/lib/datadog/opentelemetry/sdk/propagator.rb +3 -2
  40. data/lib/datadog/opentelemetry.rb +3 -0
  41. data/lib/datadog/profiling/collectors/cpu_and_wall_time_worker.rb +5 -12
  42. data/lib/datadog/profiling/component.rb +183 -13
  43. data/lib/datadog/profiling/scheduler.rb +4 -6
  44. data/lib/datadog/profiling/stack_recorder.rb +13 -2
  45. data/lib/datadog/tracing/configuration/ext.rb +0 -1
  46. data/lib/datadog/tracing/configuration/settings.rb +2 -1
  47. data/lib/datadog/tracing/contrib/action_cable/configuration/settings.rb +1 -0
  48. data/lib/datadog/tracing/contrib/action_cable/ext.rb +1 -0
  49. data/lib/datadog/tracing/contrib/action_mailer/configuration/settings.rb +1 -0
  50. data/lib/datadog/tracing/contrib/action_mailer/ext.rb +1 -0
  51. data/lib/datadog/tracing/contrib/action_pack/configuration/settings.rb +1 -0
  52. data/lib/datadog/tracing/contrib/action_pack/ext.rb +1 -0
  53. data/lib/datadog/tracing/contrib/action_view/configuration/settings.rb +1 -0
  54. data/lib/datadog/tracing/contrib/action_view/ext.rb +1 -0
  55. data/lib/datadog/tracing/contrib/active_job/configuration/settings.rb +1 -0
  56. data/lib/datadog/tracing/contrib/active_job/ext.rb +1 -0
  57. data/lib/datadog/tracing/contrib/active_model_serializers/configuration/settings.rb +1 -0
  58. data/lib/datadog/tracing/contrib/active_model_serializers/ext.rb +1 -0
  59. data/lib/datadog/tracing/contrib/active_record/configuration/settings.rb +1 -0
  60. data/lib/datadog/tracing/contrib/active_record/ext.rb +1 -0
  61. data/lib/datadog/tracing/contrib/active_support/configuration/settings.rb +1 -0
  62. data/lib/datadog/tracing/contrib/active_support/ext.rb +1 -0
  63. data/lib/datadog/tracing/contrib/analytics.rb +0 -1
  64. data/lib/datadog/tracing/contrib/aws/configuration/settings.rb +1 -0
  65. data/lib/datadog/tracing/contrib/aws/ext.rb +1 -0
  66. data/lib/datadog/tracing/contrib/dalli/configuration/settings.rb +1 -0
  67. data/lib/datadog/tracing/contrib/dalli/ext.rb +1 -0
  68. data/lib/datadog/tracing/contrib/delayed_job/configuration/settings.rb +1 -0
  69. data/lib/datadog/tracing/contrib/delayed_job/ext.rb +1 -0
  70. data/lib/datadog/tracing/contrib/elasticsearch/configuration/settings.rb +1 -0
  71. data/lib/datadog/tracing/contrib/elasticsearch/ext.rb +1 -0
  72. data/lib/datadog/tracing/contrib/ethon/configuration/settings.rb +1 -0
  73. data/lib/datadog/tracing/contrib/ethon/ext.rb +1 -0
  74. data/lib/datadog/tracing/contrib/excon/configuration/settings.rb +1 -0
  75. data/lib/datadog/tracing/contrib/excon/ext.rb +1 -0
  76. data/lib/datadog/tracing/contrib/faraday/configuration/settings.rb +7 -0
  77. data/lib/datadog/tracing/contrib/faraday/ext.rb +1 -0
  78. data/lib/datadog/tracing/contrib/faraday/middleware.rb +1 -1
  79. data/lib/datadog/tracing/contrib/grape/configuration/settings.rb +1 -0
  80. data/lib/datadog/tracing/contrib/grape/ext.rb +1 -0
  81. data/lib/datadog/tracing/contrib/graphql/configuration/settings.rb +1 -0
  82. data/lib/datadog/tracing/contrib/graphql/ext.rb +1 -0
  83. data/lib/datadog/tracing/contrib/grpc/configuration/settings.rb +1 -0
  84. data/lib/datadog/tracing/contrib/grpc/ext.rb +1 -0
  85. data/lib/datadog/tracing/contrib/http/configuration/settings.rb +1 -0
  86. data/lib/datadog/tracing/contrib/http/distributed/fetcher.rb +2 -2
  87. data/lib/datadog/tracing/contrib/http/ext.rb +1 -0
  88. data/lib/datadog/tracing/contrib/httpclient/configuration/settings.rb +1 -0
  89. data/lib/datadog/tracing/contrib/httpclient/ext.rb +1 -0
  90. data/lib/datadog/tracing/contrib/httprb/configuration/settings.rb +1 -0
  91. data/lib/datadog/tracing/contrib/httprb/ext.rb +1 -0
  92. data/lib/datadog/tracing/contrib/kafka/configuration/settings.rb +1 -0
  93. data/lib/datadog/tracing/contrib/kafka/ext.rb +1 -0
  94. data/lib/datadog/tracing/contrib/mongodb/configuration/settings.rb +1 -0
  95. data/lib/datadog/tracing/contrib/mongodb/ext.rb +1 -0
  96. data/lib/datadog/tracing/contrib/mysql2/configuration/settings.rb +5 -0
  97. data/lib/datadog/tracing/contrib/mysql2/ext.rb +1 -0
  98. data/lib/datadog/tracing/contrib/mysql2/instrumentation.rb +2 -1
  99. data/lib/datadog/tracing/contrib/opensearch/configuration/settings.rb +1 -0
  100. data/lib/datadog/tracing/contrib/opensearch/ext.rb +1 -0
  101. data/lib/datadog/tracing/contrib/pg/configuration/settings.rb +1 -0
  102. data/lib/datadog/tracing/contrib/pg/ext.rb +1 -0
  103. data/lib/datadog/tracing/contrib/presto/configuration/settings.rb +1 -0
  104. data/lib/datadog/tracing/contrib/presto/ext.rb +1 -0
  105. data/lib/datadog/tracing/contrib/qless/configuration/settings.rb +1 -0
  106. data/lib/datadog/tracing/contrib/qless/ext.rb +1 -0
  107. data/lib/datadog/tracing/contrib/que/configuration/settings.rb +1 -0
  108. data/lib/datadog/tracing/contrib/que/ext.rb +1 -0
  109. data/lib/datadog/tracing/contrib/racecar/configuration/settings.rb +1 -0
  110. data/lib/datadog/tracing/contrib/racecar/ext.rb +1 -0
  111. data/lib/datadog/tracing/contrib/rack/configuration/settings.rb +1 -0
  112. data/lib/datadog/tracing/contrib/rack/ext.rb +1 -0
  113. data/lib/datadog/tracing/contrib/rack/middlewares.rb +9 -2
  114. data/lib/datadog/tracing/contrib/rails/auto_instrument_railtie.rb +0 -2
  115. data/lib/datadog/tracing/contrib/rails/configuration/settings.rb +1 -0
  116. data/lib/datadog/tracing/contrib/rails/ext.rb +1 -0
  117. data/lib/datadog/tracing/contrib/rake/configuration/settings.rb +1 -0
  118. data/lib/datadog/tracing/contrib/rake/ext.rb +1 -0
  119. data/lib/datadog/tracing/contrib/redis/configuration/settings.rb +1 -0
  120. data/lib/datadog/tracing/contrib/redis/ext.rb +1 -0
  121. data/lib/datadog/tracing/contrib/redis/instrumentation.rb +2 -2
  122. data/lib/datadog/tracing/contrib/redis/patcher.rb +34 -21
  123. data/lib/datadog/tracing/contrib/resque/configuration/settings.rb +1 -0
  124. data/lib/datadog/tracing/contrib/resque/ext.rb +1 -0
  125. data/lib/datadog/tracing/contrib/rest_client/configuration/settings.rb +1 -0
  126. data/lib/datadog/tracing/contrib/rest_client/ext.rb +1 -0
  127. data/lib/datadog/tracing/contrib/roda/configuration/settings.rb +1 -0
  128. data/lib/datadog/tracing/contrib/roda/ext.rb +1 -0
  129. data/lib/datadog/tracing/contrib/sequel/configuration/settings.rb +1 -0
  130. data/lib/datadog/tracing/contrib/sequel/ext.rb +1 -0
  131. data/lib/datadog/tracing/contrib/shoryuken/configuration/settings.rb +1 -0
  132. data/lib/datadog/tracing/contrib/shoryuken/ext.rb +1 -0
  133. data/lib/datadog/tracing/contrib/sidekiq/configuration/settings.rb +1 -0
  134. data/lib/datadog/tracing/contrib/sidekiq/ext.rb +1 -0
  135. data/lib/datadog/tracing/contrib/sinatra/configuration/settings.rb +1 -0
  136. data/lib/datadog/tracing/contrib/sinatra/ext.rb +1 -0
  137. data/lib/datadog/tracing/contrib/sneakers/configuration/settings.rb +1 -0
  138. data/lib/datadog/tracing/contrib/sneakers/ext.rb +1 -0
  139. data/lib/datadog/tracing/contrib/stripe/configuration/settings.rb +1 -0
  140. data/lib/datadog/tracing/contrib/stripe/ext.rb +1 -0
  141. data/lib/datadog/tracing/contrib/sucker_punch/configuration/settings.rb +1 -0
  142. data/lib/datadog/tracing/contrib/sucker_punch/ext.rb +1 -0
  143. data/lib/datadog/tracing/contrib/trilogy/configuration/settings.rb +58 -0
  144. data/lib/datadog/tracing/contrib/trilogy/ext.rb +27 -0
  145. data/lib/datadog/tracing/contrib/trilogy/instrumentation.rb +94 -0
  146. data/lib/datadog/tracing/contrib/trilogy/integration.rb +43 -0
  147. data/lib/datadog/tracing/contrib/trilogy/patcher.rb +31 -0
  148. data/lib/datadog/tracing/contrib.rb +1 -0
  149. data/lib/datadog/tracing.rb +8 -2
  150. data/lib/ddtrace/version.rb +1 -1
  151. metadata +20 -6
@@ -0,0 +1,349 @@
1
+ #include "collectors_discrete_dynamic_sampler.h"
2
+
3
+ #include <ruby.h>
4
+ #include "helpers.h"
5
+ #include "time_helpers.h"
6
+ #include "ruby_helpers.h"
7
+
8
+ #define BASE_OVERHEAD_PCT 1.0
9
+ #define BASE_SAMPLING_INTERVAL 50
10
+
11
+ #define ADJUSTMENT_WINDOW_NS SECONDS_AS_NS(1)
12
+
13
+ #define EMA_SMOOTHING_FACTOR 0.6
14
+ #define EXP_MOVING_AVERAGE(last, avg, first) first ? last : (1-EMA_SMOOTHING_FACTOR) * avg + EMA_SMOOTHING_FACTOR * last
15
+
16
// Initializes a sampler with the given debug name and the default overhead target
// (BASE_OVERHEAD_PCT). The setter below also performs the initial reset, so after
// this call the sampler is fully ready to make sampling decisions.
void discrete_dynamic_sampler_init(discrete_dynamic_sampler *sampler, const char *debug_name) {
  sampler->debug_name = debug_name;
  discrete_dynamic_sampler_set_overhead_target_percentage(sampler, BASE_OVERHEAD_PCT);
}
20
+
21
// Resets the sampler back to a blank state, preserving only its configuration
// (debug_name and target_overhead). `now_ns` is injected as a parameter so the
// Ruby testing shim below can drive time deterministically.
static void _discrete_dynamic_sampler_reset(discrete_dynamic_sampler *sampler, long now_ns) {
  const char *debug_name = sampler->debug_name;
  double target_overhead = sampler->target_overhead;
  // Designated initializer zeroes every field not listed (C99 semantics).
  (*sampler) = (discrete_dynamic_sampler) {
    .debug_name = debug_name,
    .target_overhead = target_overhead,
    // Act as if a reset is a readjustment (it kinda is!) and wait for a full adjustment window
    // to compute stats. Otherwise, we'd readjust on the next event that comes and thus be operating
    // with very incomplete information
    .last_readjust_time_ns = now_ns,
    // This fake readjustment will use a hardcoded sampling interval
    .sampling_interval = BASE_SAMPLING_INTERVAL,
    .sampling_probability = 1.0 / BASE_SAMPLING_INTERVAL,
    // But we want to make sure we sample at least once in the next window so that our first
    // real readjustment has some notion of how heavy sampling is. Therefore, we'll make it so that
    // the next event is automatically sampled by artificially locating it in the interval threshold.
    .events_since_last_sample = BASE_SAMPLING_INTERVAL - 1,
  };
}
40
+
41
+ void discrete_dynamic_sampler_reset(discrete_dynamic_sampler *sampler) {
42
+ long now = monotonic_wall_time_now_ns(DO_NOT_RAISE_ON_FAILURE);
43
+ _discrete_dynamic_sampler_reset(sampler, now);
44
+ }
45
+
46
// Validates and applies a new overhead target, then resets the sampler so the next
// adjustment window is computed against the new target.
// Raises ArgumentError (via rb_raise, i.e. a non-local exit) for values outside ]0, 100].
static void _discrete_dynamic_sampler_set_overhead_target_percentage(discrete_dynamic_sampler *sampler, double target_overhead, long now_ns) {
  if (target_overhead <= 0 || target_overhead > 100) {
    rb_raise(rb_eArgError, "Target overhead must be a double between ]0,100] was %f", target_overhead);
  }
  sampler->target_overhead = target_overhead;
  _discrete_dynamic_sampler_reset(sampler, now_ns);
}
53
+
54
+ void discrete_dynamic_sampler_set_overhead_target_percentage(discrete_dynamic_sampler *sampler, double target_overhead) {
55
+ long now = monotonic_wall_time_now_ns(DO_NOT_RAISE_ON_FAILURE);
56
+ _discrete_dynamic_sampler_set_overhead_target_percentage(sampler, target_overhead, now);
57
+ }
58
+
59
+ static void maybe_readjust(discrete_dynamic_sampler *sampler, long now);
60
+
61
+ static bool _discrete_dynamic_sampler_should_sample(discrete_dynamic_sampler *sampler, long now_ns) {
62
+ // For efficiency reasons we don't do true random sampling but rather systematic
63
+ // sampling following a sample interval/skip. This can be biased and hide patterns
64
+ // but the dynamic interval and rather indeterministic pattern of allocations in
65
+ // most real applications should help reduce the bias impact.
66
+ sampler->events_since_last_sample++;
67
+ sampler->events_since_last_readjustment++;
68
+ bool should_sample = sampler->sampling_interval > 0 && sampler->events_since_last_sample >= sampler->sampling_interval;
69
+
70
+ if (should_sample) {
71
+ sampler->sample_start_time_ns = now_ns;
72
+ } else {
73
+ // check if we should readjust our sampler after this event, even if we didn't sample it
74
+ maybe_readjust(sampler, now_ns);
75
+ }
76
+
77
+ return should_sample;
78
+ }
79
+
80
// Public sampling decision, stamped with the current monotonic wall time.
bool discrete_dynamic_sampler_should_sample(discrete_dynamic_sampler *sampler) {
  return _discrete_dynamic_sampler_should_sample(sampler, monotonic_wall_time_now_ns(DO_NOT_RAISE_ON_FAILURE));
}
84
+
85
+ static long _discrete_dynamic_sampler_after_sample(discrete_dynamic_sampler *sampler, long now_ns) {
86
+ long last_sampling_time_ns = sampler->sample_start_time_ns == 0 ? 0 : long_max_of(0, now_ns - sampler->sample_start_time_ns);
87
+ sampler->samples_since_last_readjustment++;
88
+ sampler->sampling_time_since_last_readjustment_ns += last_sampling_time_ns;
89
+ sampler->events_since_last_sample = 0;
90
+
91
+ // check if we should readjust our sampler after this sample
92
+ maybe_readjust(sampler, now_ns);
93
+
94
+ return last_sampling_time_ns;
95
+ }
96
+
97
+ long discrete_dynamic_sampler_after_sample(discrete_dynamic_sampler *sampler) {
98
+ long now = monotonic_wall_time_now_ns(DO_NOT_RAISE_ON_FAILURE);
99
+ return _discrete_dynamic_sampler_after_sample(sampler, now);
100
+ }
101
+
102
+ double discrete_dynamic_sampler_probability(discrete_dynamic_sampler *sampler) {
103
+ return sampler->sampling_probability * 100.;
104
+ }
105
+
106
+ size_t discrete_dynamic_sampler_events_since_last_sample(discrete_dynamic_sampler *sampler) {
107
+ return sampler->events_since_last_sample;
108
+ }
109
+
110
+ static void maybe_readjust(discrete_dynamic_sampler *sampler, long now) {
111
+ long window_time_ns = sampler->last_readjust_time_ns == 0 ? ADJUSTMENT_WINDOW_NS : now - sampler->last_readjust_time_ns;
112
+
113
+ if (window_time_ns < ADJUSTMENT_WINDOW_NS) {
114
+ // not enough time has passed to perform a readjustment
115
+ return;
116
+ }
117
+
118
+ // If we got this far, lets recalculate our sampling params based on new observations
119
+ bool first_readjustment = !sampler->has_completed_full_adjustment_window;
120
+
121
+ // Update our running average of events/sec with latest observation
122
+ sampler->events_per_ns = EXP_MOVING_AVERAGE(
123
+ (double) sampler->events_since_last_readjustment / window_time_ns,
124
+ sampler->events_per_ns,
125
+ first_readjustment
126
+ );
127
+
128
+ // Update our running average of sampling time for a specific event
129
+ long sampling_window_time_ns = sampler->sampling_time_since_last_readjustment_ns;
130
+ long sampling_overshoot_time_ns = -1;
131
+ if (sampler->samples_since_last_readjustment > 0) {
132
+ // We can only update sampling-related stats if we actually sampled on the last window...
133
+
134
+ // Lets update our average sampling time per event
135
+ long avg_sampling_time_in_window_ns = sampler->samples_since_last_readjustment == 0 ? 0 : sampling_window_time_ns / sampler->samples_since_last_readjustment;
136
+ sampler->sampling_time_ns = EXP_MOVING_AVERAGE(
137
+ avg_sampling_time_in_window_ns,
138
+ sampler->sampling_time_ns,
139
+ first_readjustment
140
+ );
141
+ }
142
+
143
+ // Are we meeting our target in practice? If we're consistently overshooting our estimate due to non-uniform allocation patterns lets
144
+ // adjust our overhead target.
145
+ // NOTE: Updating this even when no samples occur is a conscious choice which enables us to cooldown extreme adjustments over time.
146
+ // If we didn't do this, whenever a big spike caused target_overhead_adjustment to equal target_overhead, we'd get stuck
147
+ // in a "probability = 0" state.
148
+ long reference_target_sampling_time_ns = window_time_ns * (sampler->target_overhead / 100.);
149
+ // Overshoot by definition is always >= 0. < 0 would be undershooting!
150
+ sampling_overshoot_time_ns = long_max_of(0, sampler->sampling_time_since_last_readjustment_ns - reference_target_sampling_time_ns);
151
+ // Our overhead adjustment should always be between [-target_overhead, 0]. Higher adjustments would lead to negative overhead targets
152
+ // which don't make much sense.
153
+ double last_target_overhead_adjustment = -double_min_of(sampler->target_overhead, sampling_overshoot_time_ns * 100. / window_time_ns);
154
+ sampler->target_overhead_adjustment = EXP_MOVING_AVERAGE(
155
+ last_target_overhead_adjustment,
156
+ sampler->target_overhead_adjustment,
157
+ first_readjustment
158
+ );
159
+
160
+ // Apply our overhead adjustment to figure out our real targets for this readjustment.
161
+ double target_overhead = double_max_of(0, sampler->target_overhead + sampler->target_overhead_adjustment);
162
+ long target_sampling_time_ns = window_time_ns * (target_overhead / 100.);
163
+
164
+ // Recalculate target sampling probability so that the following 2 hold:
165
+ // * window_time_ns = working_window_time_ns + sampling_window_time_ns
166
+ // │ │ │
167
+ // │ │ └ how much time is spent sampling
168
+ // │ └── how much time is spent doing actual app stuff
169
+ // └── total (wall) time in this adjustment window
170
+ // * sampling_window_time_ns <= window_time_ns * target_overhead / 100
171
+ //
172
+ // Note that
173
+ //
174
+ // sampling_window_time_ns = samples_in_window * sampling_time_ns =
175
+ // ┌─ assuming no events will be emitted during sampling
176
+ // │
177
+ // = events_per_ns * working_window_time_ns * sampling_probability * sampling_time_ns
178
+ //
179
+ // Re-ordering for sampling_probability and solving for the upper-bound of sampling_window_time_ns:
180
+ //
181
+ // sampling_window_time_ns = window_time_ns * target_overhead / 100
182
+ // sampling_probability = window_time_ns * target_overhead / 100 / (events_per_ns * working_window_time_ns * sampling_time_ns) =
183
+ //
184
+ // Which you can intuitively understand as:
185
+ //
186
+ // sampling_probability = max_allowed_time_for_sampling_ns / time_to_sample_all_events_ns
187
+ //
188
+ // As a quick sanity check:
189
+ // * If app is eventing very little or we're sampling very fast, so that time_to_sample_all_events_ns < max_allowed_time_for_sampling_ns
190
+ // then probability will be > 1 (but we should clamp to 1 since probabilities higher than 1 don't make sense).
191
+ // * If app is eventing a lot or our sampling overhead is big, then as time_to_sample_all_events_ns grows, sampling_probability will
192
+ // tend to 0.
193
+ long working_window_time_ns = long_max_of(0, window_time_ns - sampling_window_time_ns);
194
+ double max_allowed_time_for_sampling_ns = target_sampling_time_ns;
195
+ long time_to_sample_all_events_ns = sampler->events_per_ns * working_window_time_ns * sampler->sampling_time_ns;
196
+ if (max_allowed_time_for_sampling_ns == 0) {
197
+ // if we aren't allowed any sampling time at all, probability has to be 0
198
+ sampler->sampling_probability = 0;
199
+ } else {
200
+ // otherwise apply the formula described above (protecting against div by 0)
201
+ sampler->sampling_probability = time_to_sample_all_events_ns == 0 ? 1. :
202
+ double_min_of(1., max_allowed_time_for_sampling_ns / time_to_sample_all_events_ns);
203
+ }
204
+
205
+ // Doing true random selection would involve "tossing a coin" on every allocation. Lets do systematic sampling instead so that our
206
+ // sampling decision can rely solely on a sampling skip/interval (i.e. more efficient).
207
+ //
208
+ // sampling_interval = events / samples =
209
+ // = event_rate * working_window_time_ns / (event_rate * working_window_time_ns * sampling_probability)
210
+ // = 1 / sampling_probability
211
+ //
212
+ // NOTE: The sampling interval has to be an integer since we're dealing with discrete events here. This means that there'll be
213
+ // a loss of precision (and thus control) when adjusting between probabilities that lead to non-integer granularity
214
+ // changes (e.g. probabilities in the range of ]50%, 100%[ which map to intervals in the range of ]1, 2[). Our approach
215
+ // when the sampling interval is a non-integer is to ceil it (i.e. we'll always choose to sample less often).
216
+ // NOTE: Overhead target adjustments or very big sampling times can in theory bring probability so close to 0 as to effectively
217
+ // round down to full 0. This means we have to be careful to handle div-by-0 as well as resulting double intervals that
218
+ // are so big they don't fit into the sampling_interval. In both cases lets just disable sampling until next readjustment
219
+ // by setting interval to 0.
220
+ double sampling_interval = sampler->sampling_probability == 0 ? 0 : ceil(1.0 / sampler->sampling_probability);
221
+ sampler->sampling_interval = sampling_interval > ULONG_MAX ? 0 : sampling_interval;
222
+
223
+ #ifdef DD_DEBUG
224
+ double allocs_in_60s = sampler->events_per_ns * 1e9 * 60;
225
+ double samples_in_60s = allocs_in_60s * sampler->sampling_probability;
226
+ double expected_total_sampling_time_in_60s =
227
+ samples_in_60s * sampler->sampling_time_ns / 1e9;
228
+ double real_total_sampling_time_in_60s = sampling_window_time_ns / 1e9 * 60 / (window_time_ns / 1e9);
229
+
230
+ fprintf(stderr, "[dds.%s] readjusting...\n", sampler->debug_name);
231
+ fprintf(stderr, "samples_since_last_readjustment=%ld\n", sampler->samples_since_last_readjustment);
232
+ fprintf(stderr, "window_time=%ld\n", window_time_ns);
233
+ fprintf(stderr, "events_per_sec=%f\n", sampler->events_per_ns * 1e9);
234
+ fprintf(stderr, "sampling_time=%ld\n", sampler->sampling_time_ns);
235
+ fprintf(stderr, "sampling_window_time=%ld\n", sampling_window_time_ns);
236
+ fprintf(stderr, "sampling_target_time=%ld\n", reference_target_sampling_time_ns);
237
+ fprintf(stderr, "sampling_overshoot_time=%ld\n", sampling_overshoot_time_ns);
238
+ fprintf(stderr, "working_window_time=%ld\n", working_window_time_ns);
239
+ fprintf(stderr, "sampling_interval=%zu\n", sampler->sampling_interval);
240
+ fprintf(stderr, "sampling_probability=%f\n", sampler->sampling_probability);
241
+ fprintf(stderr, "expected allocs in 60s=%f\n", allocs_in_60s);
242
+ fprintf(stderr, "expected samples in 60s=%f\n", samples_in_60s);
243
+ fprintf(stderr, "expected sampling time in 60s=%f (previous real=%f)\n", expected_total_sampling_time_in_60s, real_total_sampling_time_in_60s);
244
+ fprintf(stderr, "target_overhead=%f\n", sampler->target_overhead);
245
+ fprintf(stderr, "target_overhead_adjustment=%f\n", sampler->target_overhead_adjustment);
246
+ fprintf(stderr, "target_sampling_time=%ld\n", target_sampling_time_ns);
247
+ fprintf(stderr, "expected max overhead in 60s=%f\n", target_overhead / 100.0 * 60);
248
+ fprintf(stderr, "-------\n");
249
+ #endif
250
+
251
+ sampler->events_since_last_readjustment = 0;
252
+ sampler->samples_since_last_readjustment = 0;
253
+ sampler->sampling_time_since_last_readjustment_ns = 0;
254
+ sampler->last_readjust_time_ns = now;
255
+ sampler->has_completed_full_adjustment_window = true;
256
+ }
257
+
258
// ---
// Below here is boilerplate to expose the above code to Ruby so that we can test it with RSpec as usual.

static VALUE _native_new(VALUE klass);
static VALUE _native_reset(VALUE self, VALUE now);
static VALUE _native_set_overhead_target_percentage(VALUE self, VALUE target_overhead, VALUE now);
static VALUE _native_should_sample(VALUE self, VALUE now);
static VALUE _native_after_sample(VALUE self, VALUE now);
static VALUE _native_probability(VALUE self);

// Wrapper struct held by each Testing::Sampler Ruby object.
typedef struct sampler_state {
  discrete_dynamic_sampler sampler;
} sampler_state;

// Registers Datadog::Profiling::Collectors::DiscreteDynamicSampler::Testing::Sampler
// and its _native_* methods. Called from the native extension's init with the
// Datadog::Profiling module as argument.
void collectors_discrete_dynamic_sampler_init(VALUE profiling_module) {
  VALUE collectors_module = rb_define_module_under(profiling_module, "Collectors");
  VALUE discrete_sampler_module = rb_define_module_under(collectors_module, "DiscreteDynamicSampler");
  VALUE testing_module = rb_define_module_under(discrete_sampler_module, "Testing");
  VALUE sampler_class = rb_define_class_under(testing_module, "Sampler", rb_cObject);

  rb_define_alloc_func(sampler_class, _native_new);

  rb_define_method(sampler_class, "_native_reset", _native_reset, 1);
  rb_define_method(sampler_class, "_native_set_overhead_target_percentage", _native_set_overhead_target_percentage, 2);
  rb_define_method(sampler_class, "_native_should_sample", _native_should_sample, 1);
  rb_define_method(sampler_class, "_native_after_sample", _native_after_sample, 1);
  rb_define_method(sampler_class, "_native_probability", _native_probability, 0);
}
286
+
287
// TypedData configuration for the testing Sampler wrapper.
static const rb_data_type_t sampler_typed_data = {
  .wrap_struct_name = "Datadog::Profiling::DiscreteDynamicSampler::Testing::Sampler",
  .function = {
    // RUBY_DEFAULT_FREE is enough: sampler_state is a single allocation with no owned pointers.
    .dfree = RUBY_DEFAULT_FREE,
    // NOTE(review): dsize == NULL means memory use is not reported to ObjectSpace;
    // acceptable for this test-only class.
    .dsize = NULL,
  },
  .flags = RUBY_TYPED_FREE_IMMEDIATELY
};
295
+
296
+ static VALUE _native_new(VALUE klass) {
297
+ sampler_state *state = ruby_xcalloc(sizeof(sampler_state), 1);
298
+
299
+ discrete_dynamic_sampler_init(&state->sampler, "test sampler");
300
+
301
+ return TypedData_Wrap_Struct(klass, &sampler_typed_data, state);
302
+ }
303
+
304
// Exposed for testing: resets the wrapped sampler as-if at the given monotonic
// timestamp (nanoseconds, Integer). Always returns true.
static VALUE _native_reset(VALUE self, VALUE now_ns) {
  ENFORCE_TYPE(now_ns, T_FIXNUM);

  sampler_state *state;
  TypedData_Get_Struct(self, sampler_state, &sampler_typed_data, state);

  _discrete_dynamic_sampler_reset(&state->sampler, NUM2LONG(now_ns));
  return Qtrue;
}
313
+
314
// Exposed for testing: sets a new overhead target (Float, ]0, 100]) as-if at the
// given monotonic timestamp (nanoseconds, Integer). Resets the sampler as a side effect.
static VALUE _native_set_overhead_target_percentage(VALUE self, VALUE target_overhead, VALUE now_ns) {
  ENFORCE_TYPE(target_overhead, T_FLOAT);
  ENFORCE_TYPE(now_ns, T_FIXNUM);

  sampler_state *state;
  TypedData_Get_Struct(self, sampler_state, &sampler_typed_data, state);

  _discrete_dynamic_sampler_set_overhead_target_percentage(&state->sampler, NUM2DBL(target_overhead), NUM2LONG(now_ns));

  return Qnil;
}
325
+
326
+ VALUE _native_should_sample(VALUE self, VALUE now_ns) {
327
+ ENFORCE_TYPE(now_ns, T_FIXNUM);
328
+
329
+ sampler_state *state;
330
+ TypedData_Get_Struct(self, sampler_state, &sampler_typed_data, state);
331
+
332
+ return _discrete_dynamic_sampler_should_sample(&state->sampler, NUM2LONG(now_ns)) ? Qtrue : Qfalse;
333
+ }
334
+
335
+ VALUE _native_after_sample(VALUE self, VALUE now_ns) {
336
+ ENFORCE_TYPE(now_ns, T_FIXNUM);
337
+
338
+ sampler_state *state;
339
+ TypedData_Get_Struct(self, sampler_state, &sampler_typed_data, state);
340
+
341
+ return LONG2NUM(_discrete_dynamic_sampler_after_sample(&state->sampler, NUM2LONG(now_ns)));
342
+ }
343
+
344
+ VALUE _native_probability(VALUE self) {
345
+ sampler_state *state;
346
+ TypedData_Get_Struct(self, sampler_state, &sampler_typed_data, state);
347
+
348
+ return DBL2NUM(discrete_dynamic_sampler_probability(&state->sampler));
349
+ }
@@ -0,0 +1,89 @@
1
#pragma once

#include <stdbool.h>
#include <stddef.h>

// A sampler that will sample discrete events based on the overhead of their
// sampling.
//
// NOTE: For performance reasons, this sampler does systematic sampling via
//       sampling intervals/skips that are dynamically adjusted over time.
//       It will not perform truly random sampling by "throwing a coin" at
//       every event and is thus, in theory, susceptible to some pattern
//       biases. In practice, the dynamic readjustment of sampling interval
//       and randomized starting point should help with avoiding heavy biases.
typedef struct discrete_dynamic_sampler {
  // --- Config ---
  // Name of this sampler for debug logs.
  const char *debug_name;
  // Value in the range ]0, 100] representing the % of time we're willing to dedicate
  // to sampling.
  double target_overhead;

  // -- Reference State ---
  // Moving average of how many events per ns we saw over the recent past.
  double events_per_ns;
  // Moving average of the sampling time of each individual event.
  long sampling_time_ns;
  // Sampling probability being applied by this sampler.
  double sampling_probability;
  // Sampling interval/skip that drives the systematic sampling done by this sampler.
  // NOTE: This is an inverted view of the probability.
  // NOTE: A value of 0 works as +inf, effectively disabling sampling (to align with probability=0)
  unsigned long sampling_interval;

  // -- Sampling State --
  // How many events have we seen since we last decided to sample.
  unsigned long events_since_last_sample;
  // Captures the time at which the last true-returning call to should_sample happened.
  // This is used in after_sample to understand the total sample time.
  long sample_start_time_ns;

  // -- Adjustment State --
  // Has this sampler already run for at least one complete adjustment window?
  bool has_completed_full_adjustment_window;
  // Time at which we last readjusted our sampling parameters.
  long last_readjust_time_ns;
  // How many events have we seen since the last readjustment.
  unsigned long events_since_last_readjustment;
  // How many samples have we seen since the last readjustment.
  unsigned long samples_since_last_readjustment;
  // How much time have we spent sampling since the last readjustment.
  unsigned long sampling_time_since_last_readjustment_ns;
  // A negative number that we add to target_overhead to serve as extra padding to
  // try and mitigate observed overshooting of max sampling time.
  double target_overhead_adjustment;
} discrete_dynamic_sampler;


// Init a new sampler with sane defaults.
void discrete_dynamic_sampler_init(discrete_dynamic_sampler *sampler, const char *debug_name);

// Reset a sampler, clearing all stored state.
void discrete_dynamic_sampler_reset(discrete_dynamic_sampler *sampler);

// Sets a new target_overhead for the provided sampler, resetting it in the process.
// @param target_overhead A double representing the percentage of total time we are
//        willing to use as overhead for the resulting sampling. Values are expected
//        to be in the range ]0.0, 100.0].
void discrete_dynamic_sampler_set_overhead_target_percentage(discrete_dynamic_sampler *sampler, double target_overhead);

// Make a sampling decision.
//
// @return True if the event associated with this decision should be sampled, false
//         otherwise.
//
// NOTE: If true is returned we implicitly assume the start of a sampling operation
//       and it is expected that a follow-up after_sample call is issued.
bool discrete_dynamic_sampler_should_sample(discrete_dynamic_sampler *sampler);

// Signal the end of a sampling operation.
//
// @return Sampling time in nanoseconds for the sample operation we just finished.
long discrete_dynamic_sampler_after_sample(discrete_dynamic_sampler *sampler);

// Retrieve the current sampling probability ([0.0, 100.0]) being applied by this sampler.
double discrete_dynamic_sampler_probability(discrete_dynamic_sampler *sampler);

// Retrieve the current number of events seen since last sample.
unsigned long discrete_dynamic_sampler_events_since_last_sample(discrete_dynamic_sampler *sampler);
@@ -19,7 +19,7 @@
19
19
  //
20
20
  // Instead of sampling at a fixed sample rate, the actual sampling rate should be decided by also observing the impact
21
21
  // that running the profiler is having. This protects against issues such as the profiler being deployed in very busy
22
- //machines or containers with unrealistic CPU restrictions.
22
+ // machines or containers with unrealistic CPU restrictions.
23
23
  //
24
24
  // ### Implementation
25
25
  //
@@ -35,13 +35,13 @@
35
35
  // sample. If it's not, it will skip sampling.
36
36
  //
37
37
  // Finally, as an additional optimization, there's a `dynamic_sampling_rate_get_sleep()` which, given the current
38
- // wall-time, will return the time remaining (*there's an exception, check below) until the next sample.
38
+ // wall-time, will return the time remaining (*there's an exception, check function) until the next sample.
39
39
  //
40
40
  // ---
41
41
 
42
42
  // This is the wall-time overhead we're targeting. E.g. we target to spend no more than 2%, or 1.2 seconds per minute,
43
- // taking profiling samples.
44
- #define WALL_TIME_OVERHEAD_TARGET_PERCENTAGE 2.0 // %
43
+ // taking profiling samples by default.
44
+ #define DEFAULT_WALL_TIME_OVERHEAD_TARGET_PERCENTAGE 2.0 // %
45
45
  // See `dynamic_sampling_rate_get_sleep()` for details
46
46
  #define MAX_SLEEP_TIME_NS MILLIS_AS_NS(100)
47
47
  // See `dynamic_sampling_rate_after_sample()` for details
@@ -49,6 +49,11 @@
49
49
 
50
50
  void dynamic_sampling_rate_init(dynamic_sampling_rate_state *state) {
51
51
  atomic_init(&state->next_sample_after_monotonic_wall_time_ns, 0);
52
+ dynamic_sampling_rate_set_overhead_target_percentage(state, DEFAULT_WALL_TIME_OVERHEAD_TARGET_PERCENTAGE);
53
+ }
54
+
55
+ void dynamic_sampling_rate_set_overhead_target_percentage(dynamic_sampling_rate_state *state, double overhead_target_percentage) {
56
+ state->overhead_target_percentage = overhead_target_percentage;
52
57
  }
53
58
 
54
59
  void dynamic_sampling_rate_reset(dynamic_sampling_rate_state *state) {
@@ -76,7 +81,7 @@ bool dynamic_sampling_rate_should_sample(dynamic_sampling_rate_state *state, lon
76
81
  }
77
82
 
78
83
  void dynamic_sampling_rate_after_sample(dynamic_sampling_rate_state *state, long wall_time_ns_after_sample, uint64_t sampling_time_ns) {
79
- double overhead_target = (double) WALL_TIME_OVERHEAD_TARGET_PERCENTAGE;
84
+ double overhead_target = state->overhead_target_percentage;
80
85
 
81
86
  // The idea here is that we're targeting a maximum % of wall-time spent sampling.
82
87
  // So for instance, if sampling_time_ns is 2% of the time we spend working, how much is the 98% we should spend
@@ -93,48 +98,51 @@ void dynamic_sampling_rate_after_sample(dynamic_sampling_rate_state *state, long
93
98
  // ---
94
99
  // Below here is boilerplate to expose the above code to Ruby so that we can test it with RSpec as usual.
95
100
 
96
- VALUE _native_get_sleep(DDTRACE_UNUSED VALUE self, VALUE simulated_next_sample_after_monotonic_wall_time_ns, VALUE current_monotonic_wall_time_ns);
97
- VALUE _native_should_sample(DDTRACE_UNUSED VALUE self, VALUE simulated_next_sample_after_monotonic_wall_time_ns, VALUE wall_time_ns_before_sample);
98
- VALUE _native_after_sample(DDTRACE_UNUSED VALUE self, VALUE wall_time_ns_after_sample, VALUE sampling_time_ns);
101
+ VALUE _native_get_sleep(DDTRACE_UNUSED VALUE self, VALUE overhead_target_percentage, VALUE simulated_next_sample_after_monotonic_wall_time_ns, VALUE current_monotonic_wall_time_ns);
102
+ VALUE _native_should_sample(DDTRACE_UNUSED VALUE self, VALUE overhead_target_percentage, VALUE simulated_next_sample_after_monotonic_wall_time_ns, VALUE wall_time_ns_before_sample);
103
+ VALUE _native_after_sample(DDTRACE_UNUSED VALUE self, VALUE overhead_target_percentage, VALUE wall_time_ns_after_sample, VALUE sampling_time_ns);
99
104
 
100
105
  void collectors_dynamic_sampling_rate_init(VALUE profiling_module) {
101
106
  VALUE collectors_module = rb_define_module_under(profiling_module, "Collectors");
102
107
  VALUE dynamic_sampling_rate_module = rb_define_module_under(collectors_module, "DynamicSamplingRate");
103
108
  VALUE testing_module = rb_define_module_under(dynamic_sampling_rate_module, "Testing");
104
109
 
105
- rb_define_singleton_method(testing_module, "_native_get_sleep", _native_get_sleep, 2);
106
- rb_define_singleton_method(testing_module, "_native_should_sample", _native_should_sample, 2);
107
- rb_define_singleton_method(testing_module, "_native_after_sample", _native_after_sample, 2);
110
+ rb_define_singleton_method(testing_module, "_native_get_sleep", _native_get_sleep, 3);
111
+ rb_define_singleton_method(testing_module, "_native_should_sample", _native_should_sample, 3);
112
+ rb_define_singleton_method(testing_module, "_native_after_sample", _native_after_sample, 3);
108
113
  }
109
114
 
110
- VALUE _native_get_sleep(DDTRACE_UNUSED VALUE self, VALUE simulated_next_sample_after_monotonic_wall_time_ns, VALUE current_monotonic_wall_time_ns) {
115
+ VALUE _native_get_sleep(DDTRACE_UNUSED VALUE self, VALUE overhead_target_percentage, VALUE simulated_next_sample_after_monotonic_wall_time_ns, VALUE current_monotonic_wall_time_ns) {
111
116
  ENFORCE_TYPE(simulated_next_sample_after_monotonic_wall_time_ns, T_FIXNUM);
112
117
  ENFORCE_TYPE(current_monotonic_wall_time_ns, T_FIXNUM);
113
118
 
114
119
  dynamic_sampling_rate_state state;
115
120
  dynamic_sampling_rate_init(&state);
121
+ dynamic_sampling_rate_set_overhead_target_percentage(&state, NUM2DBL(overhead_target_percentage));
116
122
  atomic_store(&state.next_sample_after_monotonic_wall_time_ns, NUM2LONG(simulated_next_sample_after_monotonic_wall_time_ns));
117
123
 
118
124
  return ULL2NUM(dynamic_sampling_rate_get_sleep(&state, NUM2LONG(current_monotonic_wall_time_ns)));
119
125
  }
120
126
 
121
- VALUE _native_should_sample(DDTRACE_UNUSED VALUE self, VALUE simulated_next_sample_after_monotonic_wall_time_ns, VALUE wall_time_ns_before_sample) {
127
+ VALUE _native_should_sample(DDTRACE_UNUSED VALUE self, VALUE overhead_target_percentage, VALUE simulated_next_sample_after_monotonic_wall_time_ns, VALUE wall_time_ns_before_sample) {
122
128
  ENFORCE_TYPE(simulated_next_sample_after_monotonic_wall_time_ns, T_FIXNUM);
123
129
  ENFORCE_TYPE(wall_time_ns_before_sample, T_FIXNUM);
124
130
 
125
131
  dynamic_sampling_rate_state state;
126
132
  dynamic_sampling_rate_init(&state);
133
+ dynamic_sampling_rate_set_overhead_target_percentage(&state, NUM2DBL(overhead_target_percentage));
127
134
  atomic_store(&state.next_sample_after_monotonic_wall_time_ns, NUM2LONG(simulated_next_sample_after_monotonic_wall_time_ns));
128
135
 
129
136
  return dynamic_sampling_rate_should_sample(&state, NUM2LONG(wall_time_ns_before_sample)) ? Qtrue : Qfalse;
130
137
  }
131
138
 
132
- VALUE _native_after_sample(DDTRACE_UNUSED VALUE self, VALUE wall_time_ns_after_sample, VALUE sampling_time_ns) {
139
+ VALUE _native_after_sample(DDTRACE_UNUSED VALUE self, VALUE overhead_target_percentage, VALUE wall_time_ns_after_sample, VALUE sampling_time_ns) {
133
140
  ENFORCE_TYPE(wall_time_ns_after_sample, T_FIXNUM);
134
141
  ENFORCE_TYPE(sampling_time_ns, T_FIXNUM);
135
142
 
136
143
  dynamic_sampling_rate_state state;
137
144
  dynamic_sampling_rate_init(&state);
145
+ dynamic_sampling_rate_set_overhead_target_percentage(&state, NUM2DBL(overhead_target_percentage));
138
146
 
139
147
  dynamic_sampling_rate_after_sample(&state, NUM2LONG(wall_time_ns_after_sample), NUM2ULL(sampling_time_ns));
140
148
 
@@ -4,10 +4,14 @@
4
4
  #include <stdbool.h>
5
5
 
6
6
  typedef struct {
7
+ // This is the wall-time overhead we're targeting. E.g. by default, we target to spend no more than 2%, or 1.2 seconds
8
+ // per minute, taking profiling samples.
9
+ double overhead_target_percentage;
7
10
  atomic_long next_sample_after_monotonic_wall_time_ns;
8
11
  } dynamic_sampling_rate_state;
9
12
 
10
13
  void dynamic_sampling_rate_init(dynamic_sampling_rate_state *state);
14
+ void dynamic_sampling_rate_set_overhead_target_percentage(dynamic_sampling_rate_state *state, double overhead_target_percentage);
11
15
  void dynamic_sampling_rate_reset(dynamic_sampling_rate_state *state);
12
16
  uint64_t dynamic_sampling_rate_get_sleep(dynamic_sampling_rate_state *state, long current_monotonic_wall_time_ns);
13
17
  bool dynamic_sampling_rate_should_sample(dynamic_sampling_rate_state *state, long wall_time_ns_before_sample);