ddtrace 1.19.0 → 1.21.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (194) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +93 -2
  3. data/LICENSE-3rdparty.csv +1 -1
  4. data/bin/ddprofrb +15 -0
  5. data/bin/ddtracerb +3 -1
  6. data/ext/{ddtrace_profiling_loader/ddtrace_profiling_loader.c → datadog_profiling_loader/datadog_profiling_loader.c} +2 -2
  7. data/ext/{ddtrace_profiling_loader → datadog_profiling_loader}/extconf.rb +3 -3
  8. data/ext/{ddtrace_profiling_native_extension → datadog_profiling_native_extension}/collectors_cpu_and_wall_time_worker.c +237 -65
  9. data/ext/datadog_profiling_native_extension/collectors_discrete_dynamic_sampler.c +422 -0
  10. data/ext/datadog_profiling_native_extension/collectors_discrete_dynamic_sampler.h +101 -0
  11. data/ext/{ddtrace_profiling_native_extension → datadog_profiling_native_extension}/collectors_thread_context.c +92 -2
  12. data/ext/{ddtrace_profiling_native_extension → datadog_profiling_native_extension}/extconf.rb +5 -2
  13. data/ext/{ddtrace_profiling_native_extension → datadog_profiling_native_extension}/helpers.h +4 -0
  14. data/ext/{ddtrace_profiling_native_extension → datadog_profiling_native_extension}/http_transport.c +10 -14
  15. data/ext/{ddtrace_profiling_native_extension → datadog_profiling_native_extension}/native_extension_helpers.rb +4 -4
  16. data/ext/{ddtrace_profiling_native_extension → datadog_profiling_native_extension}/private_vm_api_access.c +14 -0
  17. data/ext/{ddtrace_profiling_native_extension → datadog_profiling_native_extension}/private_vm_api_access.h +4 -0
  18. data/ext/{ddtrace_profiling_native_extension → datadog_profiling_native_extension}/profiling.c +17 -1
  19. data/ext/{ddtrace_profiling_native_extension → datadog_profiling_native_extension}/ruby_helpers.c +10 -0
  20. data/ext/{ddtrace_profiling_native_extension → datadog_profiling_native_extension}/ruby_helpers.h +2 -0
  21. data/ext/{ddtrace_profiling_native_extension → datadog_profiling_native_extension}/time_helpers.h +2 -0
  22. data/lib/datadog/appsec/contrib/rack/request_middleware.rb +2 -1
  23. data/lib/datadog/core/configuration/components.rb +5 -5
  24. data/lib/datadog/core/configuration/option.rb +1 -1
  25. data/lib/datadog/core/configuration/settings.rb +107 -46
  26. data/lib/datadog/core/diagnostics/environment_logger.rb +4 -3
  27. data/lib/datadog/core/environment/class_count.rb +6 -6
  28. data/lib/datadog/core/environment/git.rb +25 -0
  29. data/lib/datadog/core/environment/identity.rb +18 -48
  30. data/lib/datadog/core/git/ext.rb +2 -23
  31. data/lib/datadog/core/remote/component.rb +25 -12
  32. data/lib/datadog/core/remote/ext.rb +1 -0
  33. data/lib/datadog/core/remote/negotiation.rb +2 -2
  34. data/lib/datadog/core/remote/tie/tracing.rb +39 -0
  35. data/lib/datadog/core/remote/tie.rb +27 -0
  36. data/lib/datadog/core/remote/worker.rb +7 -4
  37. data/lib/datadog/core/transport/ext.rb +2 -0
  38. data/lib/datadog/core/utils/url.rb +25 -0
  39. data/lib/datadog/opentelemetry/sdk/propagator.rb +3 -2
  40. data/lib/datadog/opentelemetry.rb +3 -0
  41. data/lib/datadog/profiling/collectors/cpu_and_wall_time_worker.rb +6 -2
  42. data/lib/datadog/profiling/collectors/info.rb +101 -0
  43. data/lib/datadog/profiling/component.rb +14 -30
  44. data/lib/datadog/profiling/exporter.rb +19 -5
  45. data/lib/datadog/profiling/ext.rb +2 -0
  46. data/lib/datadog/profiling/flush.rb +6 -3
  47. data/lib/datadog/profiling/http_transport.rb +5 -1
  48. data/lib/datadog/profiling/load_native_extension.rb +5 -5
  49. data/lib/datadog/profiling/native_extension.rb +1 -1
  50. data/lib/datadog/profiling/tag_builder.rb +5 -0
  51. data/lib/datadog/profiling/tasks/exec.rb +3 -3
  52. data/lib/datadog/profiling/tasks/help.rb +3 -3
  53. data/lib/datadog/profiling.rb +2 -2
  54. data/lib/datadog/tracing/configuration/ext.rb +0 -1
  55. data/lib/datadog/tracing/configuration/settings.rb +2 -1
  56. data/lib/datadog/tracing/contrib/action_cable/configuration/settings.rb +1 -0
  57. data/lib/datadog/tracing/contrib/action_cable/ext.rb +1 -0
  58. data/lib/datadog/tracing/contrib/action_mailer/configuration/settings.rb +1 -0
  59. data/lib/datadog/tracing/contrib/action_mailer/ext.rb +1 -0
  60. data/lib/datadog/tracing/contrib/action_pack/configuration/settings.rb +1 -0
  61. data/lib/datadog/tracing/contrib/action_pack/ext.rb +1 -0
  62. data/lib/datadog/tracing/contrib/action_view/configuration/settings.rb +1 -0
  63. data/lib/datadog/tracing/contrib/action_view/ext.rb +1 -0
  64. data/lib/datadog/tracing/contrib/active_job/configuration/settings.rb +1 -0
  65. data/lib/datadog/tracing/contrib/active_job/ext.rb +1 -0
  66. data/lib/datadog/tracing/contrib/active_model_serializers/configuration/settings.rb +1 -0
  67. data/lib/datadog/tracing/contrib/active_model_serializers/ext.rb +1 -0
  68. data/lib/datadog/tracing/contrib/active_record/configuration/settings.rb +1 -0
  69. data/lib/datadog/tracing/contrib/active_record/ext.rb +1 -0
  70. data/lib/datadog/tracing/contrib/active_support/configuration/settings.rb +1 -0
  71. data/lib/datadog/tracing/contrib/active_support/ext.rb +1 -0
  72. data/lib/datadog/tracing/contrib/analytics.rb +0 -1
  73. data/lib/datadog/tracing/contrib/aws/configuration/settings.rb +1 -0
  74. data/lib/datadog/tracing/contrib/aws/ext.rb +1 -0
  75. data/lib/datadog/tracing/contrib/concurrent_ruby/async_patch.rb +20 -0
  76. data/lib/datadog/tracing/contrib/concurrent_ruby/patcher.rb +11 -1
  77. data/lib/datadog/tracing/contrib/dalli/configuration/settings.rb +1 -0
  78. data/lib/datadog/tracing/contrib/dalli/ext.rb +1 -0
  79. data/lib/datadog/tracing/contrib/delayed_job/configuration/settings.rb +1 -0
  80. data/lib/datadog/tracing/contrib/delayed_job/ext.rb +1 -0
  81. data/lib/datadog/tracing/contrib/elasticsearch/configuration/settings.rb +1 -0
  82. data/lib/datadog/tracing/contrib/elasticsearch/ext.rb +1 -0
  83. data/lib/datadog/tracing/contrib/ethon/configuration/settings.rb +1 -0
  84. data/lib/datadog/tracing/contrib/ethon/ext.rb +1 -0
  85. data/lib/datadog/tracing/contrib/excon/configuration/settings.rb +1 -0
  86. data/lib/datadog/tracing/contrib/excon/ext.rb +1 -0
  87. data/lib/datadog/tracing/contrib/extensions.rb +6 -2
  88. data/lib/datadog/tracing/contrib/faraday/configuration/settings.rb +7 -0
  89. data/lib/datadog/tracing/contrib/faraday/ext.rb +1 -0
  90. data/lib/datadog/tracing/contrib/faraday/middleware.rb +1 -1
  91. data/lib/datadog/tracing/contrib/grape/configuration/settings.rb +1 -0
  92. data/lib/datadog/tracing/contrib/grape/endpoint.rb +5 -0
  93. data/lib/datadog/tracing/contrib/grape/ext.rb +1 -0
  94. data/lib/datadog/tracing/contrib/graphql/configuration/settings.rb +1 -0
  95. data/lib/datadog/tracing/contrib/graphql/ext.rb +1 -0
  96. data/lib/datadog/tracing/contrib/grpc/configuration/settings.rb +1 -0
  97. data/lib/datadog/tracing/contrib/grpc/ext.rb +1 -0
  98. data/lib/datadog/tracing/contrib/http/configuration/settings.rb +1 -0
  99. data/lib/datadog/tracing/contrib/http/distributed/fetcher.rb +2 -2
  100. data/lib/datadog/tracing/contrib/http/ext.rb +1 -0
  101. data/lib/datadog/tracing/contrib/httpclient/configuration/settings.rb +1 -0
  102. data/lib/datadog/tracing/contrib/httpclient/ext.rb +1 -0
  103. data/lib/datadog/tracing/contrib/httprb/configuration/settings.rb +1 -0
  104. data/lib/datadog/tracing/contrib/httprb/ext.rb +1 -0
  105. data/lib/datadog/tracing/contrib/kafka/configuration/settings.rb +1 -0
  106. data/lib/datadog/tracing/contrib/kafka/ext.rb +1 -0
  107. data/lib/datadog/tracing/contrib/mongodb/configuration/settings.rb +1 -0
  108. data/lib/datadog/tracing/contrib/mongodb/ext.rb +1 -0
  109. data/lib/datadog/tracing/contrib/mysql2/configuration/settings.rb +1 -0
  110. data/lib/datadog/tracing/contrib/mysql2/ext.rb +1 -0
  111. data/lib/datadog/tracing/contrib/opensearch/configuration/settings.rb +1 -0
  112. data/lib/datadog/tracing/contrib/opensearch/ext.rb +1 -0
  113. data/lib/datadog/tracing/contrib/pg/configuration/settings.rb +1 -0
  114. data/lib/datadog/tracing/contrib/pg/ext.rb +1 -0
  115. data/lib/datadog/tracing/contrib/pg/instrumentation.rb +11 -4
  116. data/lib/datadog/tracing/contrib/presto/configuration/settings.rb +1 -0
  117. data/lib/datadog/tracing/contrib/presto/ext.rb +1 -0
  118. data/lib/datadog/tracing/contrib/qless/configuration/settings.rb +1 -0
  119. data/lib/datadog/tracing/contrib/qless/ext.rb +1 -0
  120. data/lib/datadog/tracing/contrib/que/configuration/settings.rb +1 -0
  121. data/lib/datadog/tracing/contrib/que/ext.rb +1 -0
  122. data/lib/datadog/tracing/contrib/racecar/configuration/settings.rb +1 -0
  123. data/lib/datadog/tracing/contrib/racecar/ext.rb +1 -0
  124. data/lib/datadog/tracing/contrib/rack/configuration/settings.rb +1 -0
  125. data/lib/datadog/tracing/contrib/rack/ext.rb +1 -0
  126. data/lib/datadog/tracing/contrib/rack/middlewares.rb +37 -6
  127. data/lib/datadog/tracing/contrib/rails/configuration/settings.rb +1 -0
  128. data/lib/datadog/tracing/contrib/rails/ext.rb +1 -0
  129. data/lib/datadog/tracing/contrib/rails/patcher.rb +16 -0
  130. data/lib/datadog/tracing/contrib/rake/configuration/settings.rb +1 -0
  131. data/lib/datadog/tracing/contrib/rake/ext.rb +1 -0
  132. data/lib/datadog/tracing/contrib/redis/configuration/settings.rb +1 -0
  133. data/lib/datadog/tracing/contrib/redis/ext.rb +1 -0
  134. data/lib/datadog/tracing/contrib/redis/instrumentation.rb +2 -2
  135. data/lib/datadog/tracing/contrib/redis/patcher.rb +34 -21
  136. data/lib/datadog/tracing/contrib/resque/configuration/settings.rb +1 -0
  137. data/lib/datadog/tracing/contrib/resque/ext.rb +1 -0
  138. data/lib/datadog/tracing/contrib/rest_client/configuration/settings.rb +1 -0
  139. data/lib/datadog/tracing/contrib/rest_client/ext.rb +1 -0
  140. data/lib/datadog/tracing/contrib/roda/configuration/settings.rb +1 -0
  141. data/lib/datadog/tracing/contrib/roda/ext.rb +1 -0
  142. data/lib/datadog/tracing/contrib/sequel/configuration/settings.rb +1 -0
  143. data/lib/datadog/tracing/contrib/sequel/ext.rb +1 -0
  144. data/lib/datadog/tracing/contrib/shoryuken/configuration/settings.rb +1 -0
  145. data/lib/datadog/tracing/contrib/shoryuken/ext.rb +1 -0
  146. data/lib/datadog/tracing/contrib/sidekiq/configuration/settings.rb +1 -0
  147. data/lib/datadog/tracing/contrib/sidekiq/ext.rb +1 -0
  148. data/lib/datadog/tracing/contrib/sinatra/configuration/settings.rb +1 -0
  149. data/lib/datadog/tracing/contrib/sinatra/ext.rb +1 -0
  150. data/lib/datadog/tracing/contrib/sinatra/tracer.rb +6 -3
  151. data/lib/datadog/tracing/contrib/sneakers/configuration/settings.rb +1 -0
  152. data/lib/datadog/tracing/contrib/sneakers/ext.rb +1 -0
  153. data/lib/datadog/tracing/contrib/stripe/configuration/settings.rb +1 -0
  154. data/lib/datadog/tracing/contrib/stripe/ext.rb +1 -0
  155. data/lib/datadog/tracing/contrib/sucker_punch/configuration/settings.rb +1 -0
  156. data/lib/datadog/tracing/contrib/sucker_punch/ext.rb +1 -0
  157. data/lib/datadog/tracing/contrib/trilogy/configuration/settings.rb +58 -0
  158. data/lib/datadog/tracing/contrib/trilogy/ext.rb +27 -0
  159. data/lib/datadog/tracing/contrib/trilogy/instrumentation.rb +94 -0
  160. data/lib/datadog/tracing/contrib/trilogy/integration.rb +43 -0
  161. data/lib/datadog/tracing/contrib/trilogy/patcher.rb +31 -0
  162. data/lib/datadog/tracing/contrib.rb +1 -0
  163. data/lib/datadog/tracing/metadata/ext.rb +2 -0
  164. data/lib/datadog/tracing/trace_operation.rb +1 -2
  165. data/lib/datadog/tracing/transport/http.rb +1 -0
  166. data/lib/datadog/tracing/transport/trace_formatter.rb +31 -0
  167. data/lib/datadog/tracing.rb +8 -2
  168. data/lib/ddtrace/version.rb +1 -1
  169. metadata +62 -50
  170. data/ext/ddtrace_profiling_native_extension/pid_controller.c +0 -57
  171. data/ext/ddtrace_profiling_native_extension/pid_controller.h +0 -45
  172. data/lib/datadog/profiling/diagnostics/environment_logger.rb +0 -39
  173. /data/ext/{ddtrace_profiling_native_extension → datadog_profiling_native_extension}/NativeExtensionDesign.md +0 -0
  174. /data/ext/{ddtrace_profiling_native_extension → datadog_profiling_native_extension}/clock_id.h +0 -0
  175. /data/ext/{ddtrace_profiling_native_extension → datadog_profiling_native_extension}/clock_id_from_pthread.c +0 -0
  176. /data/ext/{ddtrace_profiling_native_extension → datadog_profiling_native_extension}/clock_id_noop.c +0 -0
  177. /data/ext/{ddtrace_profiling_native_extension → datadog_profiling_native_extension}/collectors_dynamic_sampling_rate.c +0 -0
  178. /data/ext/{ddtrace_profiling_native_extension → datadog_profiling_native_extension}/collectors_dynamic_sampling_rate.h +0 -0
  179. /data/ext/{ddtrace_profiling_native_extension → datadog_profiling_native_extension}/collectors_gc_profiling_helper.c +0 -0
  180. /data/ext/{ddtrace_profiling_native_extension → datadog_profiling_native_extension}/collectors_gc_profiling_helper.h +0 -0
  181. /data/ext/{ddtrace_profiling_native_extension → datadog_profiling_native_extension}/collectors_idle_sampling_helper.c +0 -0
  182. /data/ext/{ddtrace_profiling_native_extension → datadog_profiling_native_extension}/collectors_idle_sampling_helper.h +0 -0
  183. /data/ext/{ddtrace_profiling_native_extension → datadog_profiling_native_extension}/collectors_stack.c +0 -0
  184. /data/ext/{ddtrace_profiling_native_extension → datadog_profiling_native_extension}/collectors_stack.h +0 -0
  185. /data/ext/{ddtrace_profiling_native_extension → datadog_profiling_native_extension}/collectors_thread_context.h +0 -0
  186. /data/ext/{ddtrace_profiling_native_extension → datadog_profiling_native_extension}/heap_recorder.c +0 -0
  187. /data/ext/{ddtrace_profiling_native_extension → datadog_profiling_native_extension}/heap_recorder.h +0 -0
  188. /data/ext/{ddtrace_profiling_native_extension → datadog_profiling_native_extension}/libdatadog_helpers.c +0 -0
  189. /data/ext/{ddtrace_profiling_native_extension → datadog_profiling_native_extension}/libdatadog_helpers.h +0 -0
  190. /data/ext/{ddtrace_profiling_native_extension → datadog_profiling_native_extension}/setup_signal_handler.c +0 -0
  191. /data/ext/{ddtrace_profiling_native_extension → datadog_profiling_native_extension}/setup_signal_handler.h +0 -0
  192. /data/ext/{ddtrace_profiling_native_extension → datadog_profiling_native_extension}/stack_recorder.c +0 -0
  193. /data/ext/{ddtrace_profiling_native_extension → datadog_profiling_native_extension}/stack_recorder.h +0 -0
  194. /data/ext/{ddtrace_profiling_native_extension → datadog_profiling_native_extension}/time_helpers.c +0 -0
@@ -0,0 +1,422 @@
1
+ #include "collectors_discrete_dynamic_sampler.h"
2
+
3
+ #include <ruby.h>
4
+ #include "helpers.h"
5
+ #include "time_helpers.h"
6
+ #include "ruby_helpers.h"
7
+
8
// Overhead target (in percent of wall time) that a freshly initialized sampler starts with.
#define BASE_OVERHEAD_PCT 1.0
// Sampling interval (sample 1 out of every N events) used until the first real readjustment happens.
#define BASE_SAMPLING_INTERVAL 50

// A readjustment is triggered once this much time has passed since the previous one...
#define ADJUSTMENT_WINDOW_NS SECONDS_AS_NS(1)
// ...or once this many samples were taken since the previous one, whichever comes first.
#define ADJUSTMENT_WINDOW_SAMPLES 100
// Any average sampling times above this value will be clamped to this value.
// In practice, this limits the budget consumption of a single sample to that of an adjustment window,
// thus aiming for a minimum sample rate of once per adjustment window (dependent on actual event rate).
// NOTE: This is our main strategy to deal with timing hiccups such as those that can be caused by
//       suspensions, system overloads and other things that could lead to arbitrarily big sampling
//       time measurements.
// NOTE: Both the argument and the whole expansion are parenthesized so that the macro keeps its meaning
//       when it is given a compound expression or when it is used inside a larger expression.
#define MAX_ALLOWED_SAMPLING_NS(target_overhead) ((long) (ADJUSTMENT_WINDOW_NS * (target_overhead) / 100.))

// Base weight given to the newest observation when updating the exponential moving averages.
#define EMA_SMOOTHING_FACTOR 0.6
23
+ void discrete_dynamic_sampler_init(discrete_dynamic_sampler *sampler, const char *debug_name, long now_ns) {
24
+ sampler->debug_name = debug_name;
25
+ discrete_dynamic_sampler_set_overhead_target_percentage(sampler, BASE_OVERHEAD_PCT, now_ns);
26
+ }
27
+
28
+ void discrete_dynamic_sampler_reset(discrete_dynamic_sampler *sampler, long now_ns) {
29
+ const char *debug_name = sampler->debug_name;
30
+ double target_overhead = sampler->target_overhead;
31
+ (*sampler) = (discrete_dynamic_sampler) {
32
+ .debug_name = debug_name,
33
+ .target_overhead = target_overhead,
34
+ // Act as if a reset is a readjustment (it kinda is!) and wait for a full adjustment window
35
+ // to compute stats. Otherwise, we'd readjust on the next event that comes and thus be operating
36
+ // with very incomplete information
37
+ .last_readjust_time_ns = now_ns,
38
+ // This fake readjustment will use a hardcoded sampling interval
39
+ .sampling_interval = BASE_SAMPLING_INTERVAL,
40
+ .sampling_probability = 1.0 / BASE_SAMPLING_INTERVAL,
41
+ .max_sampling_time_ns = MAX_ALLOWED_SAMPLING_NS(target_overhead),
42
+ // But we want to make sure we sample at least once in the next window so that our first
43
+ // real readjustment has some notion of how heavy sampling is. Therefore, we'll make it so that
44
+ // the next event is automatically sampled by artificially locating it in the interval threshold.
45
+ .events_since_last_sample = BASE_SAMPLING_INTERVAL - 1,
46
+ };
47
+ }
48
+
49
+ void discrete_dynamic_sampler_set_overhead_target_percentage(discrete_dynamic_sampler *sampler, double target_overhead, long now_ns) {
50
+ if (target_overhead <= 0 || target_overhead > 100) {
51
+ rb_raise(rb_eArgError, "Target overhead must be a double between ]0,100] was %f", target_overhead);
52
+ }
53
+ sampler->target_overhead = target_overhead;
54
+ return discrete_dynamic_sampler_reset(sampler, now_ns);
55
+ }
56
+
57
+ static void maybe_readjust(discrete_dynamic_sampler *sampler, long now);
58
+
59
+ bool discrete_dynamic_sampler_should_sample(discrete_dynamic_sampler *sampler, long now_ns) {
60
+ // For efficiency reasons we don't do true random sampling but rather systematic
61
+ // sampling following a sample interval/skip. This can be biased and hide patterns
62
+ // but the dynamic interval and rather indeterministic pattern of allocations in
63
+ // most real applications should help reduce the bias impact.
64
+ sampler->events_since_last_sample++;
65
+ sampler->events_since_last_readjustment++;
66
+ bool should_sample = sampler->sampling_interval > 0 && sampler->events_since_last_sample >= sampler->sampling_interval;
67
+
68
+ if (should_sample) {
69
+ sampler->sample_start_time_ns = now_ns;
70
+ } else {
71
+ // check if we should readjust our sampler after this event, even if we didn't sample it
72
+ maybe_readjust(sampler, now_ns);
73
+ }
74
+
75
+ return should_sample;
76
+ }
77
+
78
+ long discrete_dynamic_sampler_after_sample(discrete_dynamic_sampler *sampler, long now_ns) {
79
+ long last_sampling_time_ns = sampler->sample_start_time_ns == 0 ? 0 : long_max_of(0, now_ns - sampler->sample_start_time_ns);
80
+ sampler->samples_since_last_readjustment++;
81
+ sampler->sampling_time_since_last_readjustment_ns += last_sampling_time_ns;
82
+ sampler->events_since_last_sample = 0;
83
+
84
+ // check if we should readjust our sampler after this sample
85
+ maybe_readjust(sampler, now_ns);
86
+
87
+ return last_sampling_time_ns;
88
+ }
89
+
90
+ double discrete_dynamic_sampler_probability(discrete_dynamic_sampler *sampler) {
91
+ return sampler->sampling_probability * 100.;
92
+ }
93
+
94
+ size_t discrete_dynamic_sampler_events_since_last_sample(discrete_dynamic_sampler *sampler) {
95
+ return sampler->events_since_last_sample;
96
+ }
97
+
98
+ static double ewma_adj_window(double latest_value, double avg, long current_window_time_ns, bool is_first) {
99
+ if (is_first) {
100
+ return latest_value;
101
+ }
102
+
103
+ // We don't want samples coming from partial adjustment windows (e.g. preempted due to number of samples)
104
+ // to lead to quick "forgetting" of the past. Thus, we'll tweak the weight of this new value based on the
105
+ // size of the time window from which we gathered it in relation to our standard adjustment window time.
106
+ double fraction_of_full_window = double_min_of((double) current_window_time_ns / ADJUSTMENT_WINDOW_NS, 1);
107
+ double alpha = EMA_SMOOTHING_FACTOR * fraction_of_full_window;
108
+
109
+ return (1-alpha) * avg + alpha * latest_value;
110
+ }
111
+
112
+ static void maybe_readjust(discrete_dynamic_sampler *sampler, long now) {
113
+ long this_window_time_ns = sampler->last_readjust_time_ns == 0 ? ADJUSTMENT_WINDOW_NS : now - sampler->last_readjust_time_ns;
114
+
115
+ bool should_readjust_based_on_time = this_window_time_ns >= ADJUSTMENT_WINDOW_NS;
116
+ bool should_readjust_based_on_samples = sampler->samples_since_last_readjustment >= ADJUSTMENT_WINDOW_SAMPLES;
117
+
118
+ if (!should_readjust_based_on_time && !should_readjust_based_on_samples) {
119
+ // not enough time or samples have passed to perform a readjustment
120
+ return;
121
+ }
122
+
123
+ if (this_window_time_ns == 0) {
124
+ // should not be possible given previous condition but lets protect against div by 0 below.
125
+ return;
126
+ }
127
+
128
+ // If we got this far, lets recalculate our sampling params based on new observations
129
+ bool first_readjustment = !sampler->has_completed_full_adjustment_window;
130
+
131
+ // Update our running average of events/sec with latest observation.
132
+ sampler->events_per_ns = ewma_adj_window(
133
+ (double) sampler->events_since_last_readjustment / this_window_time_ns,
134
+ sampler->events_per_ns,
135
+ this_window_time_ns,
136
+ first_readjustment
137
+ );
138
+
139
+ // Update our running average of sampling time for a specific event
140
+ if (sampler->samples_since_last_readjustment > 0) {
141
+ // We can only update sampling-related stats if we actually sampled on the last window...
142
+
143
+ // Lets update our average sampling time per event
144
+ long avg_sampling_time_in_window_ns = sampler->samples_since_last_readjustment == 0 ? 0 : sampler->sampling_time_since_last_readjustment_ns / sampler->samples_since_last_readjustment;
145
+ if (avg_sampling_time_in_window_ns > sampler->max_sampling_time_ns) {
146
+ // If the average sampling time in the previous window was deemed unnacceptable, clamp it to the
147
+ // maximum acceptable value and register this operation in our counter.
148
+ // NOTE: This is important so that events like suspensions or system overloads do not lead us to
149
+ // learn arbitrarily big sampling times which may then result in us not sampling anything
150
+ // for very long periods of time.
151
+ avg_sampling_time_in_window_ns = sampler->max_sampling_time_ns;
152
+ sampler->sampling_time_clamps++;
153
+ }
154
+ sampler->sampling_time_ns = ewma_adj_window(
155
+ avg_sampling_time_in_window_ns,
156
+ sampler->sampling_time_ns,
157
+ this_window_time_ns,
158
+ first_readjustment
159
+ );
160
+ }
161
+
162
+ // Are we meeting our target in practice? If we're consistently overshooting our estimate due to non-uniform allocation patterns lets
163
+ // adjust our overhead target.
164
+ // NOTE: Updating this even when no samples occur is a conscious choice which enables us to cooldown extreme adjustments over time.
165
+ // If we didn't do this, whenever a big spike caused target_overhead_adjustment to equal target_overhead, we'd get stuck
166
+ // in a "probability = 0" state.
167
+ long this_window_sampling_target_time_ns = this_window_time_ns * (sampler->target_overhead / 100.);
168
+ // Overshoot by definition is always >= 0. < 0 would be undershooting!
169
+ long this_window_sampling_overshoot_time_ns = long_max_of(0, sampler->sampling_time_since_last_readjustment_ns - this_window_sampling_target_time_ns);
170
+ // Our overhead adjustment should always be between [-target_overhead, 0]. Higher adjustments would lead to negative overhead targets
171
+ // which don't make much sense.
172
+ double last_target_overhead_adjustment = -double_min_of(sampler->target_overhead, this_window_sampling_overshoot_time_ns * 100. / this_window_time_ns);
173
+ sampler->target_overhead_adjustment = ewma_adj_window(
174
+ last_target_overhead_adjustment,
175
+ sampler->target_overhead_adjustment,
176
+ this_window_time_ns,
177
+ first_readjustment
178
+ );
179
+
180
+ // Apply our overhead adjustment to figure out our real targets for this readjustment.
181
+ double target_overhead = double_max_of(0, sampler->target_overhead + sampler->target_overhead_adjustment);
182
+
183
+ // Recalculate target sampling probability so that the following 2 hold:
184
+ // * window_time_ns = working_window_time_ns + sampling_window_time_ns
185
+ // │ │ │
186
+ // │ │ └ how much time is spent sampling
187
+ // │ └── how much time is spent doing actual app stuff
188
+ // └── total (wall) time in this adjustment window
189
+ // * sampling_window_time_ns <= window_time_ns * target_overhead / 100
190
+ //
191
+ // Note that
192
+ //
193
+ // sampling_window_time_ns = samples_in_window * sampling_time_ns =
194
+ // ┌─ assuming no events will be emitted during sampling
195
+ // │
196
+ // = events_per_ns * working_window_time_ns * sampling_probability * sampling_time_ns
197
+ // = events_per_ns * (window_time_ns - sampling_window_time_ns) * sampling_probability * sampling_time_ns
198
+ //
199
+ // Re-ordering for sampling_probability and solving for the upper-bound of sampling_window_time_ns:
200
+ //
201
+ // sampling_window_time_ns = window_time_ns * target_overhead / 100
202
+ // sampling_probability = (sampling_window_time_ns) / (events_per_ns * sampling_time_ns * (window_time_ns - sampling_window_time_ns))
203
+ // = (window_time_ns * target_overhead / 100) / (events_per_ns * sampling_time_ns * window_time_ns * (1 - target_overhead / 100))
204
+ //
205
+ // Which you can intuitively understand as:
206
+ //
207
+ // sampling_probability = max_allowed_time_for_sampling_ns / time_to_sample_all_events_ns
208
+ //
209
+ // As a quick sanity check:
210
+ // * If app is eventing very little or we're sampling very fast, so that time_to_sample_all_events_ns < max_allowed_time_for_sampling_ns
211
+ // then probability will be > 1 (but we should clamp to 1 since probabilities higher than 1 don't make sense).
212
+ // * If app is eventing a lot or our sampling overhead is big, then as time_to_sample_all_events_ns grows, sampling_probability will
213
+ // tend to 0.
214
+ //
215
+ // In fact, we can simplify the equation further since the `window_time_ns` components cancel each other out:
216
+ //
217
+ // sampling_probability = (target_overhead / 100) / (events_per_ns * sampling_time_ns * (1 - target_overhead / 100))
218
+ // = max_sampling_overhead / avg_sampling_overhead
219
+
220
+ double max_sampling_overhead = target_overhead / 100.;
221
+ double avg_sampling_overhead = sampler->events_per_ns * sampler->sampling_time_ns * (1 - max_sampling_overhead);
222
+
223
+ if (max_sampling_overhead == 0) {
224
+ // if we aren't allowed any sampling overhead at all, probability has to be 0
225
+ sampler->sampling_probability = 0;
226
+ } else {
227
+ // otherwise apply the formula described above (protecting against div by 0)
228
+ sampler->sampling_probability = avg_sampling_overhead == 0 ? 1. :
229
+ double_min_of(1., max_sampling_overhead / avg_sampling_overhead);
230
+ }
231
+
232
+ // Doing true random selection would involve "tossing a coin" on every allocation. Lets do systematic sampling instead so that our
233
+ // sampling decision can rely solely on a sampling skip/interval (i.e. more efficient).
234
+ //
235
+ // sampling_interval = events / samples =
236
+ // = event_rate * working_window_time_ns / (event_rate * working_window_time_ns * sampling_probability)
237
+ // = 1 / sampling_probability
238
+ //
239
+ // NOTE: The sampling interval has to be an integer since we're dealing with discrete events here. This means that there'll be
240
+ // a loss of precision (and thus control) when adjusting between probabilities that lead to non-integer granularity
241
+ // changes (e.g. probabilities in the range of ]50%, 100%[ which map to intervals in the range of ]1, 2[). Our approach
242
+ // when the sampling interval is a non-integer is to ceil it (i.e. we'll always choose to sample less often).
243
+ // NOTE: Overhead target adjustments or very big sampling times can in theory bring probability so close to 0 as to effectively
244
+ // round down to full 0. This means we have to be careful to handle div-by-0 as well as resulting double intervals that
245
+ // are so big they don't fit into the sampling_interval. In both cases lets just disable sampling until next readjustment
246
+ // by setting interval to 0.
247
+ double sampling_interval = sampler->sampling_probability == 0 ? 0 : ceil(1.0 / sampler->sampling_probability);
248
+ sampler->sampling_interval = sampling_interval > ULONG_MAX ? 0 : sampling_interval;
249
+
250
+ #ifdef DD_DEBUG
251
+ double allocs_in_60s = sampler->events_per_ns * 1e9 * 60;
252
+ double samples_in_60s = allocs_in_60s * sampler->sampling_probability;
253
+ double expected_total_sampling_time_in_60s =
254
+ samples_in_60s * sampler->sampling_time_ns / 1e9;
255
+ double num_this_windows_in_60s = 60 * 1e9 / this_window_time_ns;
256
+ double real_total_sampling_time_in_60s = sampler->sampling_time_since_last_readjustment_ns * num_this_windows_in_60s / 1e9;
257
+
258
+ const char* readjustment_reason = should_readjust_based_on_time ? "time" : "samples";
259
+
260
+ fprintf(stderr, "[dds.%s] readjusting due to %s...\n", sampler->debug_name, readjustment_reason);
261
+ fprintf(stderr, "events_since_last_readjustment=%ld\n", sampler->events_since_last_readjustment);
262
+ fprintf(stderr, "samples_since_last_readjustment=%ld\n", sampler->samples_since_last_readjustment);
263
+ fprintf(stderr, "this_window_time=%ld\n", this_window_time_ns);
264
+ fprintf(stderr, "this_window_sampling_time=%ld\n", sampler->sampling_time_since_last_readjustment_ns);
265
+ fprintf(stderr, "this_working_window_time=%ld\n", this_window_time_ns - sampler->sampling_time_since_last_readjustment_ns);
266
+ fprintf(stderr, "this_window_sampling_target_time=%ld\n", this_window_sampling_target_time_ns);
267
+ fprintf(stderr, "this_window_sampling_overshoot_time=%ld\n", this_window_sampling_overshoot_time_ns);
268
+ fprintf(stderr, "\n");
269
+ fprintf(stderr, "target_overhead=%f\n", sampler->target_overhead);
270
+ fprintf(stderr, "target_overhead_adjustment=%f\n", sampler->target_overhead_adjustment);
271
+ fprintf(stderr, "events_per_sec=%f\n", sampler->events_per_ns * 1e9);
272
+ fprintf(stderr, "sampling_time=%ld\n", sampler->sampling_time_ns);
273
+ fprintf(stderr, "avg_sampling_overhead=%f\n", avg_sampling_overhead * 100);
274
+ fprintf(stderr, "sampling_interval=%zu\n", sampler->sampling_interval);
275
+ fprintf(stderr, "sampling_probability=%f\n", sampler->sampling_probability * 100);
276
+ fprintf(stderr, "\n");
277
+ fprintf(stderr, "expected allocs in 60s=%f\n", allocs_in_60s);
278
+ fprintf(stderr, "expected samples in 60s=%f\n", samples_in_60s);
279
+ fprintf(stderr, "expected sampling time in 60s=%f (previous real=%f)\n", expected_total_sampling_time_in_60s, real_total_sampling_time_in_60s);
280
+ fprintf(stderr, "expected max overhead in 60s=%f\n", target_overhead / 100.0 * 60);
281
+ fprintf(stderr, "\n");
282
+ fprintf(stderr, "sampling_time_clamps=%zu\n", sampler->sampling_time_clamps);
283
+ fprintf(stderr, "-------\n");
284
+ #endif
285
+
286
+ sampler->events_since_last_readjustment = 0;
287
+ sampler->samples_since_last_readjustment = 0;
288
+ sampler->sampling_time_since_last_readjustment_ns = 0;
289
+ sampler->last_readjust_time_ns = now;
290
+ sampler->has_completed_full_adjustment_window = true;
291
+ }
292
+
293
+ VALUE discrete_dynamic_sampler_state_snapshot(discrete_dynamic_sampler *sampler) {
294
+ VALUE arguments[] = {
295
+ ID2SYM(rb_intern("target_overhead")), /* => */ DBL2NUM(sampler->target_overhead),
296
+ ID2SYM(rb_intern("target_overhead_adjustment")), /* => */ DBL2NUM(sampler->target_overhead_adjustment),
297
+ ID2SYM(rb_intern("events_per_sec")), /* => */ DBL2NUM(sampler->events_per_ns * 1e9),
298
+ ID2SYM(rb_intern("sampling_time_ns")), /* => */ LONG2NUM(sampler->sampling_time_ns),
299
+ ID2SYM(rb_intern("sampling_interval")), /* => */ ULONG2NUM(sampler->sampling_interval),
300
+ ID2SYM(rb_intern("sampling_probability")), /* => */ DBL2NUM(sampler->sampling_probability * 100),
301
+ ID2SYM(rb_intern("events_since_last_readjustment")), /* => */ ULONG2NUM(sampler->events_since_last_readjustment),
302
+ ID2SYM(rb_intern("samples_since_last_readjustment")), /* => */ ULONG2NUM(sampler->samples_since_last_readjustment),
303
+ ID2SYM(rb_intern("max_sampling_time_ns")), /* => */ LONG2NUM(sampler->max_sampling_time_ns),
304
+ ID2SYM(rb_intern("sampling_time_clamps")), /* => */ ULONG2NUM(sampler->sampling_time_clamps),
305
+ };
306
+ VALUE hash = rb_hash_new();
307
+ for (long unsigned int i = 0; i < VALUE_COUNT(arguments); i += 2) rb_hash_aset(hash, arguments[i], arguments[i+1]);
308
+ return hash;
309
+ }
310
+
311
+ // ---
312
+ // Below here is boilerplate to expose the above code to Ruby so that we can test it with RSpec as usual.
313
+
314
+ static VALUE _native_new(VALUE klass);
315
+ static VALUE _native_initialize(VALUE self, VALUE now);
316
+ static VALUE _native_reset(VALUE self, VALUE now);
317
+ static VALUE _native_set_overhead_target_percentage(VALUE self, VALUE target_overhead, VALUE now);
318
+ static VALUE _native_should_sample(VALUE self, VALUE now);
319
+ static VALUE _native_after_sample(VALUE self, VALUE now);
320
+ static VALUE _native_state_snapshot(VALUE self);
321
+
322
+ typedef struct sampler_state {
323
+ discrete_dynamic_sampler sampler;
324
+ } sampler_state;
325
+
326
+ void collectors_discrete_dynamic_sampler_init(VALUE profiling_module) {
327
+ VALUE collectors_module = rb_define_module_under(profiling_module, "Collectors");
328
+ VALUE discrete_sampler_module = rb_define_module_under(collectors_module, "DiscreteDynamicSampler");
329
+ VALUE testing_module = rb_define_module_under(discrete_sampler_module, "Testing");
330
+ VALUE sampler_class = rb_define_class_under(testing_module, "Sampler", rb_cObject);
331
+
332
+ rb_define_alloc_func(sampler_class, _native_new);
333
+ // NOTE: Despite being native, we're using the normal ruby keyword to prevent having to write a whole
334
+ // new ruby file to simply proxy the initialization call.
335
+ rb_define_method(sampler_class, "initialize", _native_initialize, 1);
336
+
337
+ rb_define_method(sampler_class, "_native_reset", _native_reset, 1);
338
+ rb_define_method(sampler_class, "_native_set_overhead_target_percentage", _native_set_overhead_target_percentage, 2);
339
+ rb_define_method(sampler_class, "_native_should_sample", _native_should_sample, 1);
340
+ rb_define_method(sampler_class, "_native_after_sample", _native_after_sample, 1);
341
+ rb_define_method(sampler_class, "_native_state_snapshot", _native_state_snapshot, 0);
342
+ }
343
+
344
+ static const rb_data_type_t sampler_typed_data = {
345
+ .wrap_struct_name = "Datadog::Profiling::DiscreteDynamicSampler::Testing::Sampler",
346
+ .function = {
347
+ .dfree = RUBY_DEFAULT_FREE,
348
+ .dsize = NULL,
349
+ },
350
+ .flags = RUBY_TYPED_FREE_IMMEDIATELY
351
+ };
352
+
353
+ static VALUE _native_new(VALUE klass) {
354
+ sampler_state *state = ruby_xcalloc(sizeof(sampler_state), 1);
355
+
356
+ long now_ns = monotonic_wall_time_now_ns(DO_NOT_RAISE_ON_FAILURE);
357
+ if (now_ns == 0) {
358
+ rb_raise(rb_eRuntimeError, "failed to get clock time");
359
+ }
360
+ discrete_dynamic_sampler_init(&state->sampler, "test sampler", now_ns);
361
+
362
+ return TypedData_Wrap_Struct(klass, &sampler_typed_data, state);
363
+ }
364
+
365
+ static VALUE _native_initialize(VALUE self, VALUE now_ns) {
366
+ ENFORCE_TYPE(now_ns, T_FIXNUM);
367
+
368
+ sampler_state *state;
369
+ TypedData_Get_Struct(self, sampler_state, &sampler_typed_data, state);
370
+
371
+ discrete_dynamic_sampler_init(&state->sampler, "test sampler", NUM2LONG(now_ns));
372
+
373
+ return Qtrue;
374
+ }
375
+
376
+ static VALUE _native_reset(VALUE self, VALUE now_ns) {
377
+ ENFORCE_TYPE(now_ns, T_FIXNUM);
378
+
379
+ sampler_state *state;
380
+ TypedData_Get_Struct(self, sampler_state, &sampler_typed_data, state);
381
+
382
+ discrete_dynamic_sampler_reset(&state->sampler, NUM2LONG(now_ns));
383
+
384
+ return Qnil;
385
+ }
386
+
387
+ static VALUE _native_set_overhead_target_percentage(VALUE self, VALUE target_overhead, VALUE now_ns) {
388
+ ENFORCE_TYPE(target_overhead, T_FLOAT);
389
+ ENFORCE_TYPE(now_ns, T_FIXNUM);
390
+
391
+ sampler_state *state;
392
+ TypedData_Get_Struct(self, sampler_state, &sampler_typed_data, state);
393
+
394
+ discrete_dynamic_sampler_set_overhead_target_percentage(&state->sampler, NUM2DBL(target_overhead), NUM2LONG(now_ns));
395
+
396
+ return Qnil;
397
+ }
398
+
399
+ VALUE _native_should_sample(VALUE self, VALUE now_ns) {
400
+ ENFORCE_TYPE(now_ns, T_FIXNUM);
401
+
402
+ sampler_state *state;
403
+ TypedData_Get_Struct(self, sampler_state, &sampler_typed_data, state);
404
+
405
+ return discrete_dynamic_sampler_should_sample(&state->sampler, NUM2LONG(now_ns)) ? Qtrue : Qfalse;
406
+ }
407
+
408
+ VALUE _native_after_sample(VALUE self, VALUE now_ns) {
409
+ ENFORCE_TYPE(now_ns, T_FIXNUM);
410
+
411
+ sampler_state *state;
412
+ TypedData_Get_Struct(self, sampler_state, &sampler_typed_data, state);
413
+
414
+ return LONG2NUM(discrete_dynamic_sampler_after_sample(&state->sampler, NUM2LONG(now_ns)));
415
+ }
416
+
417
+ VALUE _native_state_snapshot(VALUE self) {
418
+ sampler_state *state;
419
+ TypedData_Get_Struct(self, sampler_state, &sampler_typed_data, state);
420
+
421
+ return discrete_dynamic_sampler_state_snapshot(&state->sampler);
422
+ }
@@ -0,0 +1,101 @@
1
+ #pragma once
2
+
3
+ #include <stdbool.h>
4
+ #include <stddef.h>
5
+
6
+ #include <ruby.h>
7
+
8
// Sampler for discrete events that decides what to sample based on how expensive
// sampling has been.
//
// NOTE: For performance reasons the sampling is systematic: events are picked using a
//       sampling interval/skip that gets readjusted dynamically over time. No "coin toss"
//       happens per event, so this is not truly random sampling and is, in theory,
//       susceptible to some pattern biases. In practice, the dynamic readjustment of the
//       interval together with a randomized starting point should help avoid heavy biases.
typedef struct discrete_dynamic_sampler {
  // --- Config ---
  // Name used to identify this sampler in debug logs.
  const char *debug_name;
  // Percentage of time we're willing to spend sampling; a value in the range ]0, 100].
  double target_overhead;

  // --- Reference State ---
  // Moving average of the number of events per ns observed over the recent past.
  double events_per_ns;
  // Moving average of how long sampling an individual event takes.
  long sampling_time_ns;
  // Sampling probability currently applied by this sampler.
  double sampling_probability;
  // Sampling interval/skip driving the systematic sampling done by this sampler.
  // NOTE: This is an inverted view of the probability.
  // NOTE: A value of 0 works as +inf, effectively disabling sampling (to align with probability=0)
  unsigned long sampling_interval;
  // Upper bound allowed for an individual sampling time measurement.
  long max_sampling_time_ns;

  // --- Sampling State ---
  // Number of events seen since we last decided to sample.
  unsigned long events_since_last_sample;
  // Time at which should_sample last returned true.
  // after_sample uses it to work out the total time taken by the sample.
  long sample_start_time_ns;

  // --- Adjustment State ---
  // Has this sampler already run for at least one complete adjustment window?
  bool has_completed_full_adjustment_window;
  // Time at which we last readjusted our sampling parameters.
  long last_readjust_time_ns;
  // Number of events seen since the last readjustment.
  unsigned long events_since_last_readjustment;
  // Number of samples taken since the last readjustment.
  unsigned long samples_since_last_readjustment;
  // Total time spent sampling since the last readjustment.
  unsigned long sampling_time_since_last_readjustment_ns;
  // A negative number added to target_overhead as extra padding, to try and mitigate
  // observed overshooting of the max sampling time.
  double target_overhead_adjustment;

  // --- Interesting stats ---
  unsigned long sampling_time_clamps;
} discrete_dynamic_sampler;
64
+
65
+
66
+ // Init a new sampler with sane defaults.
67
+ void discrete_dynamic_sampler_init(discrete_dynamic_sampler *sampler, const char *debug_name, long now_ns);
68
+
69
+ // Reset a sampler, clearing all stored state.
70
+ void discrete_dynamic_sampler_reset(discrete_dynamic_sampler *sampler, long now_ns);
71
+
72
+ // Sets a new target_overhead for the provided sampler, resetting it in the process.
73
+ // @param target_overhead A double representing the percentage of total time we are
74
+ // willing to use as overhead for the resulting sampling. Values are expected
75
+ // to be in the range ]0.0, 100.0].
76
+ void discrete_dynamic_sampler_set_overhead_target_percentage(discrete_dynamic_sampler *sampler, double target_overhead, long now_ns);
77
+
78
+ // Make a sampling decision.
79
+ //
80
+ // @return True if the event associated with this decision should be sampled, false
81
+ // otherwise.
82
+ //
83
+ // NOTE: If true is returned we implicitly assume the start of a sampling operation
84
+ // and it is expected that a follow-up after_sample call is issued.
85
+ bool discrete_dynamic_sampler_should_sample(discrete_dynamic_sampler *sampler, long now_ns);
86
+
87
+ // Signal the end of a sampling operation.
88
+ //
89
+ // @return Sampling time in nanoseconds for the sample operation we just finished.
90
+ long discrete_dynamic_sampler_after_sample(discrete_dynamic_sampler *sampler, long now_ns);
91
+
92
+ // Retrieve the current sampling probability ([0.0, 100.0]) being applied by this sampler.
93
+ double discrete_dynamic_sampler_probability(discrete_dynamic_sampler *sampler);
94
+
95
+ // Retrieve the current number of events seen since last sample.
96
+ unsigned long discrete_dynamic_sampler_events_since_last_sample(discrete_dynamic_sampler *sampler);
97
+
98
+ // Return a Ruby hash containing a snapshot of this sampler's interesting state at calling time.
99
+ // WARN: This allocates in the Ruby VM and therefore should not be called without the
100
+ // VM lock or during GC.
101
+ VALUE discrete_dynamic_sampler_state_snapshot(discrete_dynamic_sampler *sampler);